diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
index bc511bbd8..0508a4c36 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
@@ -1,39 +1,12 @@
-import os
-import logging
-import torch
-import soundfile
-
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
 
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-os.environ["MODELSCOPE_CACHE"] = "./"
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.4'
+    model_revision='v1.0.5',
+    mode="paraformer_fake_streaming"
 )
-
-model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")
-speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
-speech_length = speech.shape[0]
-
-sample_offset = 0
-chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
-stride_size = chunk_size[1] * 960
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
-final_result = ""
-
-for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
-    if sample_offset + stride_size >= speech_length - 1:
-        stride_size = speech_length - sample_offset
-        param_dict["is_final"] = True
-    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
-                                    param_dict=param_dict)
-    if len(rec_result) != 0:
-        final_result += rec_result['text']
-        print(rec_result)
-print(final_result)
+audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+rec_result = inference_pipeline(audio_in=audio_in)
+print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
new file mode 100644
index 000000000..73d6b3547
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
@@ -0,0 +1,40 @@
+import os
+import logging
+import torch
+import soundfile
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+os.environ["MODELSCOPE_CACHE"] = "./"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.4',
+    mode="paraformer_streaming"
+)
+
+model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")
+speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
+speech_length = speech.shape[0]
+
+sample_offset = 0
+chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
+stride_size = chunk_size[1] * 960
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+final_result = ""
+
+for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+    if sample_offset + stride_size >= speech_length - 1:
+        stride_size = speech_length - sample_offset
+        param_dict["is_final"] = True
+    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
+                                    param_dict=param_dict)
+    if len(rec_result) != 0:
+        final_result += rec_result['text']
+        print(rec_result)
+print(final_result)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index f5296f678..b5a222502 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -1618,7 +1618,7 @@ def inference_launch(**kwargs):
         return inference_uniasr(**kwargs)
     elif mode == "paraformer":
         return inference_paraformer(**kwargs)
-    elif mode == "paraformer_online":
+    elif mode == "paraformer_fake_streaming":
         return inference_paraformer(**kwargs)
     elif mode == "paraformer_streaming":
         return inference_paraformer_online(**kwargs)
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index da675864c..46eabd1eb 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -633,6 +633,8 @@ class SANMEncoderChunkOpt(AbsEncoder):
             self.embed = torch.nn.Linear(input_size, output_size)
         elif input_layer == "pe":
             self.embed = SinusoidalPositionEncoder()
+        elif input_layer == "pe_online":
+            self.embed = StreamSinusoidalPositionEncoder()
         else:
             raise ValueError("unknown input_layer: " + input_layer)
         self.normalize_before = normalize_before
@@ -818,6 +820,59 @@ class SANMEncoderChunkOpt(AbsEncoder):
             return (xs_pad, intermediate_outs), olens, None
         return xs_pad, olens, None
 
+    def _add_overlap_chunk(self, feats: np.ndarray, cache: dict = {}):
+        if len(cache) == 0:
+            return feats
+        cache["feats"] = to_device(cache["feats"], device=feats.device)
+        overlap_feats = torch.cat((cache["feats"], feats), dim=1)
+        cache["feats"] = overlap_feats[:, -(cache["chunk_size"][0] + cache["chunk_size"][2]):, :]
+        return overlap_feats
+
+    def forward_chunk(self,
+                      xs_pad: torch.Tensor,
+                      ilens: torch.Tensor,
+                      cache: dict = None,
+                      ctc: CTC = None,
+                      ):
+        xs_pad *= self.output_size() ** 0.5
+        if self.embed is None:
+            xs_pad = xs_pad
+        else:
+            xs_pad = self.embed(xs_pad, cache)
+        if cache["tail_chunk"]:
+            xs_pad = to_device(cache["feats"], device=xs_pad.device)
+        else:
+            xs_pad = self._add_overlap_chunk(xs_pad, cache)
+        encoder_outs = self.encoders0(xs_pad, None, None, None, None)
+        xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        intermediate_outs = []
+        if len(self.interctc_layer_idx) == 0:
+            encoder_outs = self.encoders(xs_pad, None, None, None, None)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        else:
+            for layer_idx, encoder_layer in enumerate(self.encoders):
+                encoder_outs = encoder_layer(xs_pad, None, None, None, None)
+                xs_pad, masks = encoder_outs[0], encoder_outs[1]
+                if layer_idx + 1 in self.interctc_layer_idx:
+                    encoder_out = xs_pad
+
+                    # intermediate outputs are also normalized
+                    if self.normalize_before:
+                        encoder_out = self.after_norm(encoder_out)
+
+                    intermediate_outs.append((layer_idx + 1, encoder_out))
+
+                    if self.interctc_use_conditioning:
+                        ctc_out = ctc.softmax(encoder_out)
+                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), None, None
+        return xs_pad, ilens, None
+
     def gen_tf2torch_map_dict(self):
         tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
         tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
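
Note for readers of this patch (not part of the diff): demo_online.py derives its stride from chunk_size without spelling out the units. Below is a minimal standalone sketch of that arithmetic, assuming 16 kHz input and the demo's convention of 960 samples (60 ms) per encoder frame; the constant names and helper function are illustrative, not FunASR API.

# Illustrative sketch, not part of the patch: how demo_online.py's
# chunk_size maps to a stride in samples and a chunk duration in ms.
SAMPLE_RATE = 16000       # Hz; the 16k model expects 16 kHz audio
SAMPLES_PER_FRAME = 960   # demo convention: 960 samples = 60 ms per encoder frame

def stride_info(chunk_size):
    # chunk_size = [left context, current chunk, right context], in encoder frames
    stride_samples = chunk_size[1] * SAMPLES_PER_FRAME  # matches stride_size in the demo
    stride_ms = 1000 * stride_samples // SAMPLE_RATE
    return stride_samples, stride_ms

for cfg in ([5, 10, 5], [8, 8, 4]):
    samples, ms = stride_info(cfg)
    print(f"chunk_size={cfg}: stride={samples} samples, {ms} ms per chunk")
# chunk_size=[5, 10, 5]: stride=9600 samples, 600 ms per chunk
# chunk_size=[8, 8, 4]: stride=7680 samples, 480 ms per chunk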
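
Similarly, a standalone sketch of the overlap-caching pattern that _add_overlap_chunk adds to SANMEncoderChunkOpt: each call prepends the cached tail of the previous chunk along the time axis, then keeps the last chunk_size[0] + chunk_size[2] frames for the next call. The shapes and cache seeding below are made up for illustration; in the patch the cache dict is prepared by the inference pipeline.

# Illustrative sketch, not part of the patch: the rolling feature cache
# behind SANMEncoderChunkOpt._add_overlap_chunk.
import torch

def add_overlap_chunk(feats, cache):
    if len(cache) == 0:
        return feats
    # Prepend the tail cached from the previous chunk along the time axis...
    overlap_feats = torch.cat((cache["feats"], feats), dim=1)
    # ...then keep the last (left + right context) frames for the next call.
    keep = cache["chunk_size"][0] + cache["chunk_size"][2]
    cache["feats"] = overlap_feats[:, -keep:, :]
    return overlap_feats

# Hypothetical shapes: batch 1, 10 frames per chunk, 4-dim features.
cache = {"feats": torch.zeros(1, 10, 4), "chunk_size": [5, 10, 5]}
chunk = torch.randn(1, 10, 4)
out = add_overlap_chunk(chunk, cache)
print(out.shape)             # torch.Size([1, 20, 4]): cached context + new chunk
print(cache["feats"].shape)  # torch.Size([1, 10, 4]): tail kept for the next chunk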