diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py index 4aae8e970..bf5590c9b 100644 --- a/funasr/bin/asr_inference_paraformer_streaming.py +++ b/funasr/bin/asr_inference_paraformer_streaming.py @@ -205,9 +205,12 @@ class Speech2Text: results = [] cache_en = cache["encoder"] if speech.shape[1] < 16 * 60 and cache_en["is_final"]: + if cache_en["start_idx"] == 0: + return [] cache_en["tail_chunk"] = True feats = cache_en["feats"] feats_len = torch.tensor([feats.shape[1]]) + self.asr_model.frontend = None results = self.infer(feats, feats_len, cache) return results else: diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py index 969ddadf2..2a680114e 100644 --- a/funasr/models/encoder/sanm_encoder.py +++ b/funasr/models/encoder/sanm_encoder.py @@ -380,7 +380,7 @@ class SANMEncoder(AbsEncoder): else: xs_pad = self.embed(xs_pad, cache) if cache["tail_chunk"]: - xs_pad = cache["feats"] + xs_pad = to_device(cache["feats"], device=xs_pad.device) else: xs_pad = self._add_overlap_chunk(xs_pad, cache) encoder_outs = self.encoders0(xs_pad, None, None, None, None)