Merge pull request #257 from alibaba-damo-academy/dev_wjm

update
This commit is contained in:
hnluo 2023-03-17 15:18:10 +08:00 committed by GitHub
commit 39b175e426
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23

View File

@@ -18,15 +18,11 @@ def forward_segment(text, seg_dict):
 
 def seg_tokenize(txt, seg_dict):
     out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
     for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "<unk>" + " "
+        if word in seg_dict:
+            out_txt += seg_dict[word] + " "
         else:
-            continue
+            out_txt += "<unk>" + " "
     return out_txt.strip().split()
 
 def tokenize(data,