Summary of this patch (text before the first "diff --git" header is
ignored by patch tools):

  * funasr/bin/asr_inference_mfcca.py: drop the module-level
    header_colors / end_colors ANSI escape constants.
  * funasr/datasets/preprocessor.py: seg_tokenize no longer gates each
    element of txt behind the single-character regex; every element is
    now looked up in seg_dict directly. Elements missing from seg_dict
    still only append whitespace, which the final strip().split()
    discards.
  * tests/test_asr_inference_pipeline.py: correct the garbled model id
    used by test_uniasr_2pass_zhcn_16k_common_vocab8358_offline.

NOTE(review): this patch was recovered from a copy whose newlines had
been collapsed. Indentation and the position of blank context lines were
reconstructed from the hunk line counts and Python syntax; verify against
the target tree before applying.

diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 888d4d2f8..6f3dbb113 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -41,8 +41,6 @@ from funasr.utils.types import str_or_none
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 import pdb
 
-header_colors = '\033[95m'
-end_colors = '\033[0m'
 
 global_asr_language: str = 'zh-cn'
 global_sample_rate: Union[int, Dict[Any, int]] = {
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 20a3791de..98cca1dcd 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -47,15 +47,11 @@ def forward_segment(text, dic):
 
 def seg_tokenize(txt, seg_dict):
     out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
     for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "" + " "
+        if word in seg_dict:
+            out_txt += seg_dict[word] + " "
         else:
-            continue
+            out_txt += "" + " "
     return out_txt.strip().split()
 
 def seg_tokenize_wo_pattern(txt, seg_dict):
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 32b8af5ec..b3c5a2467 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -452,7 +452,7 @@ class TestUniasrInferencePipelines(unittest.TestCase):
     def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
         inference_pipeline = pipeline(
             task=Tasks.auto_speech_recognition,
-            model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
+            model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
             param_dict={"decoding_model": "offline"})