From 7aee2a6a718e9398315533d171625ee015205a6f Mon Sep 17 00:00:00 2001
From: speech_asr
Date: Fri, 17 Mar 2023 15:14:18 +0800
Subject: [PATCH] update

---
 funasr/datasets/large_datasets/utils/tokenize.py | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index caeb42626..a016e4ead 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -18,15 +18,11 @@ def forward_segment(text, seg_dict):
 
 def seg_tokenize(txt, seg_dict):
     out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
     for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "" + " "
+        if word in seg_dict:
+            out_txt += seg_dict[word] + " "
         else:
-            continue
+            out_txt += "" + " "
     return out_txt.strip().split()
 
 def tokenize(data,