Merge branch 'dev_infer' of https://github.com/alibaba/FunASR into dev_infer

2025-09-15 14:48:36 +08:00 · 2023-05-17 15:16:06 +08:00 · 2023-05-17 15:16:06 +08:00 · e1ba6bc138
commit e1ba6bc138
parent d1374e9c80 3be66ec19a
6 changed files with 14 additions and 13 deletions
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/infer.py
@@ -7,8 +7,9 @@ https://arxiv.org/abs/2303.05397
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks

-# 初始化推理 pipeline
-# 当以原始音频作为输入时使用配置文件 sond.yaml，并设置 mode 为sond_demo
+# initialize the pipeline for inference
+# when using raw waveform files for inference, please use the config file `sond.yaml`
+# and set mode to `sond_demo`
 inference_diar_pipline = pipeline(
    mode="sond_demo",
    num_workers=0,
@@ -19,7 +20,8 @@ inference_diar_pipline = pipeline(
    sv_model_revision="master",
 )

-# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
+# use audio_list as the input, where the first entry is the recording to be diarized
+# and the remaining entries are enrollment recordings for the different speakers
 audio_list = [
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/infer.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/infer.py
@@ -7,8 +7,9 @@ https://arxiv.org/abs/2211.10243
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks

-# 初始化推理 pipeline
-# 当以原始音频作为输入时使用配置文件 sond.yaml，并设置 mode 为sond_demo
+# initialize the pipeline for inference
+# when using raw waveform files for inference, please use the config file `sond.yaml`
+# and set mode to `sond_demo`
 inference_diar_pipline = pipeline(
    mode="sond_demo",
    num_workers=0,
@@ -19,7 +20,8 @@ inference_diar_pipline = pipeline(
    sv_model_revision="master",
 )

-# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
+# use audio_list as the input, where the first entry is the recording to be diarized
+# and the remaining entries are enrollment recordings for the different speakers
 audio_list = [
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/record.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk1.wav",
--- a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer_sv.py
+++ b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer_sv.py
@@ -7,13 +7,13 @@ if __name__ == '__main__':
        model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
    )

-    # 两个语音为相同说话人
+    # the two utterances are from the same speaker
    rec_result = inference_sv_pipline(audio_in=(
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
    print("Similarity", rec_result["scores"])

-    # 两个语音为不同说话人
+    # the two utterances are from different speakers
    rec_result = inference_sv_pipline(audio_in=(
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
        'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav'))
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -38,7 +38,6 @@ from typeguard import check_return_type
 from scipy.signal import medfilt
 from funasr.utils.cli_utils import get_commandline_args
 from funasr.tasks.diar import DiarTask
-from funasr.tasks.asr import ASRTask
 from funasr.tasks.diar import EENDOLADiarTask
 from funasr.torch_utils.device_funcs import to_device
 from funasr.torch_utils.set_all_random_seed import set_all_random_seed
@@ -187,7 +186,7 @@ def inference_sond(
                raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ")
        else:
            # 3. Build data-iterator
-            loader = ASRTask.build_streaming_iterator(
+            loader = DiarTask.build_streaming_iterator(
                data_path_and_name_and_type,
                dtype=dtype,
                batch_size=batch_size,
--- a/funasr/bin/sv_infer.py
+++ b/funasr/bin/sv_infer.py
@@ -23,7 +23,6 @@ from typeguard import check_return_type

 from funasr.utils.cli_utils import get_commandline_args
 from funasr.tasks.sv import SVTask
-from funasr.tasks.asr import ASRTask
 from funasr.torch_utils.device_funcs import to_device
 from funasr.torch_utils.set_all_random_seed import set_all_random_seed
 from funasr.utils import config_argparse
--- a/funasr/bin/sv_inference_launch.py
+++ b/funasr/bin/sv_inference_launch.py
@@ -34,7 +34,6 @@ from typeguard import check_return_type

 from funasr.utils.cli_utils import get_commandline_args
 from funasr.tasks.sv import SVTask
-from funasr.tasks.asr import ASRTask
 from funasr.torch_utils.device_funcs import to_device
 from funasr.torch_utils.set_all_random_seed import set_all_random_seed
 from funasr.utils import config_argparse
@@ -115,7 +114,7 @@ def inference_sv(
            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
        
        # 3. Build data-iterator
-        loader = ASRTask.build_streaming_iterator(
+        loader = SVTask.build_streaming_iterator(
            data_path_and_name_and_type,
            dtype=dtype,
            batch_size=batch_size,