Merge branch 'dev_gzf_funasr2' of github.com:alibaba-damo-academy/FunASR into dev_gzf_funasr2

add
2025-09-15 14:48:36 +08:00 · 2023-11-27 13:53:07 +08:00 · 2023-11-27 13:53:07 +08:00 · 3cee2214b7
commit 3cee2214b7
parent b5d3df75cf eed40f4f08
8 changed files with 67 additions and 57 deletions
--- a/71
+++ b/71
@ -1,3 +1,39 @@
+FunASR Model Open Source License
+Version 1.0
+
+Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
+
+Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from them.
+
+To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
+
+1 Definitions
+In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refers to individuals or organizations who use, modify, share, and learn from [FunASR software].
+
+2 License and Restrictions
+
+2.1 License
+You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
+
+2.2 Restrictions
+You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
+
+3 Responsibility and Risk
+[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
+
+4 Termination
+If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
+
+5 Revision
+This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
+
+6 Other Provisions
+This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
+
+If you have any questions or comments about this agreement, please contact us.
+
+Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
+
 FunASR 模型开源协议

 版本号：1.0
@ -36,38 +72,3 @@ FunASR 模型开源协议

 版权所有© [2023-2028] [阿里巴巴集团]。保留所有权利。

-FunASR Model Open Source License
-Version 1.0
-
-Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
-
-Thank you for choosing the FunASR open source models. The FunASR open source models contain a series of open-source models that allow everyone to use, modify, share, and learn from it.
-
-To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
-
-1 Definitions
-In this agreement, [FunASR software] refers to the FunASR open source model, and its derivatives, including fine-tuned models. [You] refer to individuals or organizations who use, modify, share, and learn from [FunASR software].
-
-2 License and Restrictions
-
-2.1 License
-You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
-
-2.2 Restrictions
-You should indicate the code and model source and author information when using, copying, modifying and sharing [FunASR software]. You should keep the relevant names of models in [FunASR software].
-
-3 Responsibility and Risk
-[FunASR software] is for reference and learning purposes only and is not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You should take responsibility and risks for your use and modification of [FunASR software].
-
-4 Termination
-If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
-
-5 Revision
-This agreement may be updated and revised from time to time. The revised agreement will be published in the FunASR official repository and automatically take effect. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
-
-6 Other Provisions
-This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
-
-If you have any questions or comments about this agreement, please contact us.
-
-Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@ -956,24 +956,29 @@ def inference_paraformer_vad_speaker(
                ed = int(vadsegment[1]) / 1000
                vad_segments.append(
                    [st, ed, audio[int(st * 16000):int(ed * 16000)]])
-            check_audio_list(vad_segments)
-            # sv pipeline
-            segments = sv_chunk(vad_segments)
-            embeddings = []
-            for s in segments:
-                #_, embs = self.sv_pipeline([s[2]], output_emb=True)
-                # embeddings.append(embs)
-                wavs = sv_preprocess([s[2]])
-                # embs = self.forward(wavs)
-                embs = []
-                for x in wavs:
-                    x = extract_feature([x])
-                    embs.append(sv_model(x))
-                embs = torch.cat(embs)
-                embeddings.append(embs.detach().numpy())
-            embeddings = np.concatenate(embeddings)
-            labels = cb_model(embeddings)
-            sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            audio_dur = check_audio_list(vad_segments)
+            if audio_dur > 5:
+                # sv pipeline
+                segments = sv_chunk(vad_segments)
+                embeddings = []
+                for s in segments:
+                    #_, embs = self.sv_pipeline([s[2]], output_emb=True)
+                    # embeddings.append(embs)
+                    wavs = sv_preprocess([s[2]])
+                    # embs = self.forward(wavs)
+                    embs = []
+                    for x in wavs:
+                        x = extract_feature([x])
+                        embs.append(sv_model(x))
+                    embs = torch.cat(embs)
+                    embeddings.append(embs.detach().numpy())
+                embeddings = np.concatenate(embeddings)
+                labels = cb_model(embeddings)
+                sv_output = postprocess(segments, vad_segments, labels, embeddings)
+            else:
+                # fake speaker result for too-short utterance
+                sv_output = [[0.0, vadsegments[-1][-1]/1000.0, 0]]
+                logging.warning("Too short utterence found: {}, return default speaker results.".format(keys))

            speech, speech_lengths = batch["speech"], batch["speech_lengths"]

--- a/funasr/models/e2e_asr_contextual_paraformer.py
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@ -134,7 +134,7 @@ class NeatContextualParaformer(Paraformer):
            text_lengths: torch.Tensor,
            hotword_pad: torch.Tensor,
            hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
+            dha_pad: torch.Tensor,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Frontend + Encoder + Decoder + Calc loss

@ -207,7 +207,7 @@ class NeatContextualParaformer(Paraformer):
        # 2b. Attention decoder branch
        if self.ctc_weight != 1.0:
            loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
-                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
            )

        # 3. CTC-Att loss definition
@ -242,7 +242,6 @@ class NeatContextualParaformer(Paraformer):
            ys_pad_lens: torch.Tensor,
            hotword_pad: torch.Tensor,
            hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
    ):
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
--- a/funasr/utils/speaker_utils.py
+++ b/funasr/utils/speaker_utils.py
@ -35,7 +35,8 @@ def check_audio_list(audio: list):
            assert seg[0] >= audio[
                i - 1][1], 'modelscope error: Wrong time stamps.'
        audio_dur += seg[1] - seg[0]
-    assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'
+    return audio_dur
+    # assert audio_dur > 5, 'modelscope error: The effective audio duration is too short.'


 def sv_preprocess(inputs: Union[np.ndarray, list]):
--- a/runtime/onnxruntime/src/fsmn-vad-online.cpp
+++ b/runtime/onnxruntime/src/fsmn-vad-online.cpp
@ -110,6 +110,7 @@ int FsmnVadOnline::OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_fi
                    p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
                }
                out_feats.emplace_back(p);
+                p.clear();
            } else {
                lfr_splice_frame_idxs = i;
                break;
--- a/runtime/onnxruntime/src/fsmn-vad.cpp
+++ b/runtime/onnxruntime/src/fsmn-vad.cpp
@ -264,6 +264,7 @@ void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
                p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
+            p.clear();
        }
    }
    // Apply cmvn
--- a/runtime/onnxruntime/src/paraformer-online.cpp
+++ b/runtime/onnxruntime/src/paraformer-online.cpp
@ -164,6 +164,7 @@ int ParaformerOnline::OnlineLfrCmvn(vector<vector<float>> &wav_feats, bool input
                    p.insert(p.end(), wav_feats[wav_feats.size() - 1].begin(), wav_feats[wav_feats.size() - 1].end());
                }
                out_feats.emplace_back(p);
+                p.clear();
            } else {
                lfr_splice_frame_idxs = i;
                break;
--- a/runtime/onnxruntime/src/paraformer.cpp
+++ b/runtime/onnxruntime/src/paraformer.cpp
@ -436,6 +436,7 @@ void Paraformer::LfrCmvn(std::vector<std::vector<float>> &asr_feats) {
                p.insert(p.end(), asr_feats[asr_feats.size() - 1].begin(), asr_feats[asr_feats.size() - 1].end());
            }
            out_feats.emplace_back(p);
+            p.clear();
        }
    }
    // Apply cmvn