# Copyright (c) Alibaba, Inc. and its affiliates.
"""Helpers that turn a list of recognised tokens into a readable sentence.

The entry point is :func:`sentence_postprocess`; the remaining public
functions are the character-class predicates and the abbreviation merger it
relies on.
"""

import string
from typing import Any, List, Union

# Substrings removed from every item before it is classified.
# NOTE(review): the two empty-string entries are no-ops for ``str.replace`` as
# written; they look like marker literals whose content was lost - confirm the
# intended strings before relying on them.
_IGNORED_FRAGMENTS = (' ', '', '')

# Tokens dropped outright by ``sentence_postprocess``.
# NOTE(review): the duplicated '~~' and the empty entry look like placeholder
# tokens that were altered in transit - confirm the intended values.
_DISCARDED_TOKENS = ('~~', '~~', '')


def _strip_fragments(item: str) -> str:
    """Return *item* with every entry of ``_IGNORED_FRAGMENTS`` removed."""
    for fragment in _IGNORED_FRAGMENTS:
        item = item.replace(fragment, '')
    return item


def isChinese(ch: str) -> bool:
    """Return True when *ch* compares inside the range U+4E00..U+9FFF.

    This is a plain string comparison, so a multi-character argument is
    compared lexicographically (the outcome is essentially decided by its
    first character) and the empty string yields False.
    """
    return '\u4e00' <= ch <= '\u9fff'


def isAllChinese(word: Union[List[Any], str]) -> bool:
    """Return True when every item of *word* passes :func:`isChinese`.

    Args:
        word: A string (inspected character by character) or a list of
            strings (inspected token by token).

    Returns:
        False for an empty *word* or as soon as one cleaned item is not
        Chinese, True otherwise.
    """
    cleaned = [_strip_fragments(item) for item in word]
    if not cleaned:
        return False
    return all(isChinese(item) for item in cleaned)


def _is_non_chinese_alpha(item: str) -> bool:
    """Return True for an apostrophe or an alphabetic, non-Chinese item."""
    if item.isalpha():
        # str.isalpha() is also True for Chinese characters, so they have to
        # be excluded explicitly.
        return not isChinese(item)
    return item == "'"


def isAllAlpha(word: Union[List[Any], str]) -> bool:
    """Return True when every item of *word* is alphabetic and not Chinese.

    A lone apostrophe is accepted as well.

    Args:
        word: A string (inspected character by character) or a list of
            strings (inspected token by token).

    Returns:
        False for an empty *word* or as soon as one cleaned item fails the
        check, True otherwise.
    """
    cleaned = [_strip_fragments(item) for item in word]
    if not cleaned:
        return False
    return all(_is_non_chinese_alpha(item) for item in cleaned)


def _is_ascii_letter(token: Any) -> bool:
    """Return True when *token* is exactly one ASCII letter.

    ``bytes.isalpha`` only recognises ASCII letters, so encoding first rejects
    every non-ASCII character.
    """
    return len(token) == 1 and token.encode('utf-8').isalpha()


def _abbr_continues(words: List[Any], pos: int) -> bool:
    """Return True when ``words[pos]`` is followed by ' ' and a single letter."""
    return (pos + 2 < len(words) and words[pos + 1] == ' '
            and _is_ascii_letter(words[pos + 2]))


def abbr_dispose(words: List[Any]) -> List[Any]:
    """Merge runs of space-separated single letters into upper-case letters.

    A run is two or more single ASCII letters separated by exactly one ' '
    token each, e.g. ``['i', ' ', 'b', ' ', 'm']``.  Its letters are
    upper-cased and the separating spaces are dropped; every token outside a
    run is copied unchanged.

    Args:
        words: Token list, normally produced by ``sentence_postprocess``.

    Returns:
        A new list; *words* itself is not modified.
    """
    size = len(words)
    merged = []
    pos = 0
    # Single left-to-right scan: each token is visited once, so no membership
    # lookups against begin/end index lists are needed.
    while pos < size:
        token = words[pos]
        if _is_ascii_letter(token) and _abbr_continues(words, pos):
            merged.append(token.upper())
            while _abbr_continues(words, pos):
                pos += 2  # skip the separating space
                merged.append(words[pos].upper())
        else:
            merged.append(token)
        pos += 1
    return merged


def sentence_postprocess(words: List[Any]) -> str:
    """Join recognised tokens into a sentence.

    Tokens that are not ``str`` are decoded as UTF-8 and tokens listed in
    ``_DISCARDED_TOKENS`` are dropped.  What remains is handled in one of
    three ways:

    * all Chinese: tokens are concatenated with inner spaces removed;
    * all alphabetic: every token is followed by a space;
    * mixed: Chinese tokens are appended directly, tokens containing '@@'
      are accumulated (with '@@' removed) and prefixed to the next alphabetic
      token, and alphabetic tokens are followed by a space that is removed
      again when a Chinese token comes next.

    Finally spaced single letters are merged by :func:`abbr_dispose` and the
    result is stripped.

    Args:
        words: Tokens as ``str`` or UTF-8 encoded ``bytes``.

    Returns:
        The assembled sentence.

    Raises:
        ValueError: In mixed input, for a token that is neither Chinese,
            an '@@' fragment, nor alphabetic.
    """
    # wash words lists
    decoded = [
        item if isinstance(item, str) else item.decode('utf-8')
        for item in words
    ]
    tokens = [token for token in decoded if token not in _DISCARDED_TOKENS]

    pieces = []
    pending = ''  # text of '@@' fragments waiting for their closing token

    if isAllChinese(tokens):
        # all chinese characters
        pieces = [token.replace(' ', '') for token in tokens]
    elif isAllAlpha(tokens):
        # all alpha characters
        for token in tokens:
            if '@@' in token:
                pending += token.replace('@@', '')
            else:
                pieces.append(pending + token)
                pieces.append(' ')
                pending = ''
    else:
        # mix characters
        trailing_blank = False  # True while the last piece is a word space
        for token in tokens:
            if isAllChinese(token):
                if trailing_blank:
                    # no space between a latin word and a Chinese character
                    pieces.pop()
                pieces.append(token)
                trailing_blank = False
            elif '@@' in token:
                # `pending` is only emitted once a later alphabetic token
                # arrives; a fragment with no such token never reaches the
                # output.
                pending += token.replace('@@', '')
                trailing_blank = False
            elif isAllAlpha(token):
                pieces.append(pending + token)
                pieces.append(' ')
                pending = ''
                trailing_blank = True
            else:
                raise ValueError('invalid character: {}'.format(token))

    return ''.join(abbr_dispose(pieces)).strip()