diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
index 808084fd5..b56645413 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -34,6 +34,6 @@ for sample_offset in range(0, speech_length, min(stride_size, speech_length - sa
     rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                     param_dict=param_dict)
     if len(rec_result) != 0:
-        final_result += " ".join(rec_result['text']) + " "
+        final_result += rec_result['text'] + " "
     print(rec_result)
 print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
index 0ecf1ab39..6672bbf78 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -34,6 +34,6 @@ for sample_offset in range(0, speech_length, min(stride_size, speech_length - sa
     rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                     param_dict=param_dict)
     if len(rec_result) != 0:
-        final_result += " ".join(rec_result['text']) + " "
+        final_result += rec_result['text'] + " "
     print(rec_result)
 print(final_result.strip())
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index be0d752a1..4f04d02e3 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -553,12 +553,12 @@ def inference_modelscope(
                 asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size],
                                          input_lens)
                 if len(asr_result) != 0:
                     final_result += " ".join(asr_result) + " "
-            item = {'key': "utt", 'value': [final_result.strip()]}
+            item = {'key': "utt", 'value': final_result.strip()}
         else:
             input_lens = torch.tensor([raw_inputs.shape[1]])
             cache["encoder"]["is_final"] = is_final
             asr_result = speech2text(cache, raw_inputs, input_lens)
-            item = {'key': "utt", 'value': asr_result}
+            item = {'key': "utt", 'value': " ".join(asr_result)}
         asr_result_list.append(item)
         if is_final:
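
The change in the two infer.py examples is that rec_result['text'] is a plain str, so " ".join(rec_result['text']) was interleaving a space between every character; the fix appends the string directly. Below is a minimal, self-contained sketch of the corrected chunked streaming loop, assuming the ModelScope pipeline interface these example scripts use; the model id, input file, chunk stride, and param_dict keys ('cache', 'is_final') are illustrative and may differ across FunASR versions.

# Minimal sketch of the corrected chunked streaming loop (assumptions noted above).
import soundfile
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online')

speech, sample_rate = soundfile.read('example.wav')  # hypothetical input file
speech_length = speech.shape[0]
stride_size = 9600                                   # assumed 600 ms chunks at 16 kHz
param_dict = {'cache': dict()}                       # streaming cache carried across chunks
final_result = ""

for sample_offset in range(0, speech_length, stride_size):
    # Flag the last chunk so the model can flush its internal state.
    param_dict['is_final'] = sample_offset + stride_size >= speech_length
    rec_result = inference_pipeline(
        audio_in=speech[sample_offset: sample_offset + stride_size],
        param_dict=param_dict)
    if len(rec_result) != 0:
        # rec_result['text'] is already a str; " ".join() over it would
        # insert a space between every character (the bug fixed above).
        final_result += rec_result['text'] + " "

print(final_result.strip())

The asr_inference_paraformer_streaming.py hunk applies the same normalization on the backend side: both branches now store a plain string under 'value' (final_result.strip() in the chunked branch, " ".join(asr_result) in the single-pass branch), so every item appended to asr_result_list has a consistent shape.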