fix punc model

This commit is contained in:
游雁 2024-01-13 22:35:20 +08:00
parent 49e8e9d8fc
commit ccac6ceea9

View File

@ -664,26 +664,6 @@ class CodeMixTokenizerCommonPreprocessor(CommonPreprocessor):
if self.seg_jieba:
jieba.load_userdict(seg_dict_file)
@classmethod
def split_words(cls, text: str) -> list:
    """Split mixed ASCII / non-ASCII text into a flat list of words.

    The text is first split on whitespace. Inside each whitespace-free
    segment, consecutive ASCII characters are grouped into a single word,
    while every non-ASCII character (for example a Chinese character)
    becomes a one-character word of its own.

    Args:
        text: The string to split.

    Returns:
        The words in their original order. The list is empty when ``text``
        is empty or consists only of whitespace.
    """
    words = []
    for seg in text.split():
        # ``seg`` contains no whitespace.
        ascii_run = []
        for c in seg:
            # Code points below 0x80 are exactly the ones UTF-8 stores in a
            # single byte. Testing the code point directly avoids encoding
            # every character and does not raise on lone surrogates, which
            # ``str.encode()`` rejects.
            if ord(c) < 0x80:
                ascii_run.append(c)
                continue
            # A non-ASCII character ends any pending ASCII word and is
            # emitted as a word by itself.
            if ascii_run:
                words.append("".join(ascii_run))
                ascii_run = []
            words.append(c)
        # Flush the ASCII word that reaches the end of the segment.
        if ascii_run:
            words.append("".join(ascii_run))
    return words
@classmethod
def isEnglish(cls, text:str):