diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 022d32131..0d2fd84ee 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -17,13 +17,21 @@ def forward_segment(text, seg_dict):
     return word_list
 
 def seg_tokenize(txt, seg_dict):
+    pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
     out_txt = ""
     for word in txt:
         word = word.lower()
         if word in seg_dict:
             out_txt += seg_dict[word] + " "
         else:
-            out_txt += "" + " "
+            if pattern.match(word):
+                for char in word:
+                    if char in seg_dict:
+                        out_txt += seg_dict[char] + " "
+                    else:
+                        out_txt += "" + " "
+            else:
+                out_txt += "" + " "
     return out_txt.strip().split()
 
 def tokenize(data,
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 20a38314a..758c75045 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -44,15 +44,22 @@ def forward_segment(text, dic):
         i += len(longest_word)
     return word_list
 
-
 def seg_tokenize(txt, seg_dict):
+    pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
     out_txt = ""
     for word in txt:
         word = word.lower()
         if word in seg_dict:
             out_txt += seg_dict[word] + " "
         else:
-            out_txt += "" + " "
+            if pattern.match(word):
+                for char in word:
+                    if char in seg_dict:
+                        out_txt += seg_dict[char] + " "
+                    else:
+                        out_txt += "" + " "
+            else:
+                out_txt += "" + " "
     return out_txt.strip().split()
 
 def seg_tokenize_wo_pattern(txt, seg_dict):
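
For review purposes, below is a self-contained sketch of the patched `seg_tokenize` behavior: when a whole word is missing from `seg_dict` but consists only of CJK characters and/or digits, the function now falls back to per-character lookup instead of emitting an empty token. The sketch imports `re` explicitly so it runs on its own (the diff itself does not show whether the patched files already import it), and the sample `seg_dict` and input words are illustrative assumptions, not taken from the FunASR lexicon.

```python
import re

# Illustrative sketch of the patched seg_tokenize logic from this diff.
def seg_tokenize(txt, seg_dict):
    # Matches words made up entirely of CJK ideographs and/or ASCII digits.
    pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
    out_txt = ""
    for word in txt:
        word = word.lower()
        if word in seg_dict:
            # Whole word is in the segmentation dictionary.
            out_txt += seg_dict[word] + " "
        else:
            if pattern.match(word):
                # New fallback: split an out-of-vocabulary CJK/digit word
                # into characters and look up each character individually.
                for char in word:
                    if char in seg_dict:
                        out_txt += seg_dict[char] + " "
                    else:
                        out_txt += "" + " "
            else:
                # Other OOV words are still dropped, as before the patch.
                out_txt += "" + " "
    return out_txt.strip().split()


if __name__ == "__main__":
    # Hypothetical dictionary mapping tokens to their sub-token expansions.
    seg_dict = {"你": "你", "好": "好", "2": "<two>", "hello": "hel@@ lo"}
    # "你好2" is OOV as a whole word, so it now falls back to per-character lookup.
    print(seg_tokenize(["hello", "你好2"], seg_dict))
    # -> ['hel@@', 'lo', '你', '好', '<two>']
```

With the old code, the OOV word "你好2" in this example would have contributed nothing to the output; with the patched fallback its characters are still recovered from `seg_dict`.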