From ffd4aac427ae2fece25bc2434da5f3f0fc8da0fc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E4=BB=81=E8=BF=B7?= Date: Fri, 17 Mar 2023 23:47:07 +0800 Subject: [PATCH] update paraformer streaming recipe --- .../infer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py index 3dfb91228..c1c541ba8 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py @@ -13,7 +13,7 @@ inference_pipeline = pipeline( model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online', model_revision='v1.0.2') -waveform, sample_rate = torchaudio.load("asr_example_zh.wav") +waveform, sample_rate = torchaudio.load("waihu.wav") speech_length = waveform.shape[1] speech = waveform[0] @@ -28,7 +28,7 @@ speech_buffer = speech speech_cache = [] final_result = "" -while len(speech_buffer) > 0: +while len(speech_buffer) >= 960: if first_chunk: if len(speech_buffer) >= 14400: rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict) @@ -47,11 +47,11 @@ while len(speech_buffer) > 0: rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict) speech_buffer = speech_buffer[9600:] else: - cache_en["stride"] = len(speech_buffer) // 960 + cache_en["stride"] = len(speech_buffer) // 960 cache_en["pad_right"] = 0 rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict) speech_buffer = [] - if rec_result['text'] != "sil": + if len(rec_result) !=0 and rec_result['text'] != "sil": final_result += rec_result['text'] print(rec_result) print(final_result)