add en-us speaker verification (sv) model

志浩 2023-03-09 12:16:58 +08:00
parent 31dda90f2d
commit 777ae05adb
5 changed files with 171 additions and 8 deletions

View File

@@ -0,0 +1,39 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
import numpy as np

if __name__ == '__main__':
    inference_sv_pipline = pipeline(
        task=Tasks.speaker_verification,
        model='damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
    )
    # extract speaker embeddings
    # for a URL input, the embedding is returned under the "spk_embedding" key
    rec_result = inference_sv_pipline(
        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
    enroll = rec_result["spk_embedding"]
    # for a local file, the embedding is likewise under the "spk_embedding" key
    rec_result = inference_sv_pipline(audio_in='sv_example_same.wav')
    same = rec_result["spk_embedding"]
    import soundfile
    wav = soundfile.read('sv_example_enroll.wav')[0]
    # for a raw waveform input, the embedding is likewise under the "spk_embedding" key
    spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
    rec_result = inference_sv_pipline(
        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav')
    different = rec_result["spk_embedding"]
    # cosine similarity for the same speaker, rescaled around the decision threshold
    sv_threshold = 0.9465
    same_cos = np.sum(enroll * same) / (np.linalg.norm(enroll) * np.linalg.norm(same))
    same_cos = max(same_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
    print("Similarity:", same_cos)
    # cosine similarity for a different speaker
    diff_cos = np.sum(enroll * different) / (np.linalg.norm(enroll) * np.linalg.norm(different))
    diff_cos = max(diff_cos - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
    print("Similarity:", diff_cos)

View File

@@ -0,0 +1,21 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    inference_sv_pipline = pipeline(
        task=Tasks.speaker_verification,
        model='damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch'
    )
    # the same speaker: pass an (enroll, test) audio pair and read the score from "scores"
    rec_result = inference_sv_pipline(audio_in=(
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
    print("Similarity:", rec_result["scores"])
    # different speakers
    rec_result = inference_sv_pipline(audio_in=(
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav'))
    print("Similarity:", rec_result["scores"])

View File

@@ -12,20 +12,20 @@ if __name__ == '__main__':
    # for url use "utt_id" as key
    rec_result = inference_sv_pipline(
        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
-    enroll = rec_result["utt_id"]
+    enroll = rec_result["spk_embedding"]
    # for local file use "utt_id" as key
    rec_result = inference_sv_pipline(audio_in='sv_example_same.wav')["test1"]
-    same = rec_result["test1"]
+    same = rec_result["spk_embedding"]
    import soundfile
    wav = soundfile.read('sv_example_enroll.wav')[0]
    # for raw inputs use "utt_id" as key
-    spk_embedding = inference_sv_pipline(audio_in=wav)["utt_id"]
+    spk_embedding = inference_sv_pipline(audio_in=wav)["spk_embedding"]
    rec_result = inference_sv_pipline(
        audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav')
-    different = rec_result["utt_id"]
+    different = rec_result["spk_embedding"]
    # calculate cosine similarity for the same speaker
    sv_threshold = 0.9465

View File

@@ -387,7 +387,6 @@ class ResNet34_SP_L2Reg(AbsEncoder):
        return var_dict_torch_update


class ResNet34Diar(ResNet34):
    def __init__(
        self,
View File

@ -1,14 +1,18 @@
import argparse
import logging
import os
from pathlib import Path
from typing import Callable
from typing import Collection
from typing import Dict
from typing import List
from typing import Optional
from typing import Tuple
from typing import Union
import numpy as np
import torch
import yaml
from typeguard import check_argument_types
from typeguard import check_return_type
@@ -21,7 +25,7 @@ from funasr.models.e2e_asr import ESPnetASRModel
from funasr.models.decoder.abs_decoder import AbsDecoder
from funasr.models.encoder.abs_encoder import AbsEncoder
from funasr.models.encoder.rnn_encoder import RNNEncoder
-from funasr.models.encoder.resnet34_encoder import ResNet34
+from funasr.models.encoder.resnet34_encoder import ResNet34, ResNet34_SP_L2Reg
from funasr.models.pooling.statistic_pooling import StatisticPooling
from funasr.models.decoder.sv_decoder import DenseDecoder
from funasr.models.e2e_sv import ESPnetSVModel
@@ -103,6 +107,7 @@ encoder_choices = ClassChoices(
    "encoder",
    classes=dict(
        resnet34=ResNet34,
+        resnet34_sp_l2reg=ResNet34_SP_L2Reg,
        rnn=RNNEncoder,
    ),
    type_check=AbsEncoder,
@@ -394,9 +399,16 @@ class SVTask(AbsTask):
        # 7. Pooling layer
        pooling_class = pooling_choices.get_class(args.pooling_type)
+        pooling_dim = (2, 3)
+        eps = 1e-12
+        if hasattr(args, "pooling_type_conf"):
+            if "pooling_dim" in args.pooling_type_conf:
+                pooling_dim = args.pooling_type_conf["pooling_dim"]
+            if "eps" in args.pooling_type_conf:
+                eps = args.pooling_type_conf["eps"]
        pooling_layer = pooling_class(
-            pooling_dim=(2, 3),
-            eps=1e-12,
+            pooling_dim=pooling_dim,
+            eps=eps,
        )
        if args.pooling_type == "statistic":
            encoder_output_size *= 2
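For context, a minimal sketch of how the new pooling_type_conf override behaves; the Namespace below stands in for a parsed training config, and the eps value is an arbitrary illustration, not taken from this commit:

import argparse
args = argparse.Namespace(
    pooling_type="statistic",
    pooling_type_conf={"pooling_dim": (2, 3), "eps": 1e-6},  # hypothetical config values
)
pooling_dim, eps = (2, 3), 1e-12  # defaults, as in the code above
if hasattr(args, "pooling_type_conf"):
    pooling_dim = args.pooling_type_conf.get("pooling_dim", pooling_dim)
    eps = args.pooling_type_conf.get("eps", eps)
print(pooling_dim, eps)  # (2, 3) 1e-06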
@@ -435,3 +447,95 @@ class SVTask(AbsTask):
        assert check_return_type(model)

        return model

    # ~~~~~~~~~ The methods below are mainly used for inference ~~~~~~~~~

    @classmethod
    def build_model_from_file(
        cls,
        config_file: Union[Path, str] = None,
        model_file: Union[Path, str] = None,
        cmvn_file: Union[Path, str] = None,
        device: str = "cpu",
    ):
        """Build model from the files.

        This method is used for inference or fine-tuning.

        Args:
            config_file: The yaml file saved when training.
            model_file: The model file saved when training.
            cmvn_file: The cmvn file for the front-end.
            device: Device type, "cpu", "cuda", or "cuda:N".
        """
        assert check_argument_types()
        if config_file is None:
            assert model_file is not None, (
                "The argument 'model_file' must be provided "
                "if the argument 'config_file' is not specified."
            )
            config_file = Path(model_file).parent / "config.yaml"
        else:
            config_file = Path(config_file)

        with config_file.open("r", encoding="utf-8") as f:
            args = yaml.safe_load(f)
        if cmvn_file is not None:
            args["cmvn_file"] = cmvn_file
        args = argparse.Namespace(**args)
        model = cls.build_model(args)
        if not isinstance(model, AbsESPnetModel):
            raise RuntimeError(
                f"model must inherit {AbsESPnetModel.__name__}, but got {type(model)}"
            )
        model.to(device)

        model_dict = dict()
        model_name_pth = None
        if model_file is not None:
            logging.info("model_file is {}".format(model_file))
            if device == "cuda":
                device = f"cuda:{torch.cuda.current_device()}"
            model_dir = os.path.dirname(model_file)
            model_name = os.path.basename(model_file)
            if "model.ckpt-" in model_name or ".bin" in model_name:
                # a TF checkpoint (or .bin file) is converted once, then cached next to it
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is loaded from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
                else:
                    model_dict = cls.convert_tf2torch(model, model_file)
                model.load_state_dict(model_dict)
            else:
                model_dict = torch.load(model_file, map_location=device)
                model.load_state_dict(model_dict)
        if model_name_pth is not None and not os.path.exists(model_name_pth):
            torch.save(model_dict, model_name_pth)
            logging.info("model_file is saved to pth: {}".format(model_name_pth))

        return model, args

    @classmethod
    def convert_tf2torch(
        cls,
        model,
        ckpt,
    ):
        logging.info("start converting tf model to torch model")
        from funasr.modules.streaming_utils.load_fr_tf import load_tf_dict
        var_dict_tf = load_tf_dict(ckpt)
        var_dict_torch = model.state_dict()
        var_dict_torch_update = dict()
        # speech encoder
        var_dict_torch_update_local = model.encoder.convert_tf2torch(var_dict_tf, var_dict_torch)
        var_dict_torch_update.update(var_dict_torch_update_local)
        # pooling layer
        var_dict_torch_update_local = model.pooling_layer.convert_tf2torch(var_dict_tf, var_dict_torch)
        var_dict_torch_update.update(var_dict_torch_update_local)
        # decoder
        var_dict_torch_update_local = model.decoder.convert_tf2torch(var_dict_tf, var_dict_torch)
        var_dict_torch_update.update(var_dict_torch_update_local)

        return var_dict_torch_update
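A minimal usage sketch for the new inference entry point; the import path is assumed from this task class and the file paths are hypothetical placeholders, not files from this commit:

from funasr.tasks.sv import SVTask  # assumed module path for SVTask

model, args = SVTask.build_model_from_file(
    config_file="exp/sv_train/config.yaml",    # hypothetical training config
    model_file="exp/sv_train/model.ckpt-100",  # TF checkpoint; converted and cached as .pth on first load
    device="cpu",
)
model.eval()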