diff --git a/examples/industrial_data_pretraining/emotion2vec/demo.py b/examples/industrial_data_pretraining/emotion2vec/demo.py
index b274bd966..940b9f932 100644
--- a/examples/industrial_data_pretraining/emotion2vec/demo.py
+++ b/examples/industrial_data_pretraining/emotion2vec/demo.py
@@ -9,7 +9,7 @@ from funasr import AutoModel
 model = AutoModel(model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4",
                   # vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                   # vad_model_revision="v2.0.4",
-                  # vad_kwargs={"max_single_segment_time": 10},
+                  # vad_kwargs={"max_single_segment_time": 1000},
                   )

 wav_file = f"{model.model_path}/example/test.wav"
diff --git a/examples/industrial_data_pretraining/paraformer/demo.py b/examples/industrial_data_pretraining/paraformer/demo.py
index 499791fa2..a39a52673 100644
--- a/examples/industrial_data_pretraining/paraformer/demo.py
+++ b/examples/industrial_data_pretraining/paraformer/demo.py
@@ -9,7 +9,7 @@ model = AutoModel(model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-
                   model_revision="v2.0.4",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                   vad_model_revision="v2.0.4",
-                  vad_kwargs={"max_single_segment_time": 60},
+                  vad_kwargs={"max_single_segment_time": 60000},
                   punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
                   punc_model_revision="v2.0.4",
                   # spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
diff --git a/examples/industrial_data_pretraining/whisper/demo.py b/examples/industrial_data_pretraining/whisper/demo.py
index a073f68f3..e1e1aad29 100644
--- a/examples/industrial_data_pretraining/whisper/demo.py
+++ b/examples/industrial_data_pretraining/whisper/demo.py
@@ -10,7 +10,7 @@ from funasr import AutoModel
 model = AutoModel(model="iic/Whisper-large-v3",
                   model_revision="v2.0.5",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_kwargs={"max_single_segment_time": 30},
+                  vad_kwargs={"max_single_segment_time": 30000},
                   )

 res = model.generate(
diff --git a/examples/industrial_data_pretraining/whisper/demo_from_openai.py b/examples/industrial_data_pretraining/whisper/demo_from_openai.py
index 9a2276445..a257bc820 100644
--- a/examples/industrial_data_pretraining/whisper/demo_from_openai.py
+++ b/examples/industrial_data_pretraining/whisper/demo_from_openai.py
@@ -12,7 +12,7 @@ from funasr import AutoModel
 # model = AutoModel(model="Whisper-large-v2", hub="openai")
 model = AutoModel(model="Whisper-large-v3",
                   vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                  vad_kwargs={"max_single_segment_time": 30},
+                  vad_kwargs={"max_single_segment_time": 30000},
                   hub="openai",
                   )

diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index bba44e792..bd786d0b6 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -312,7 +312,8 @@ class AutoModel:
            key = res[i]["key"]
            vadsegments = res[i]["value"]
            input_i = data_list[i]
-            speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
+            fs = kwargs["frontend"].fs if hasattr(kwargs["frontend"], "fs") else 16000
+            speech = load_audio_text_image_video(input_i, fs=fs, audio_fs=kwargs.get("fs", 16000))
            speech_lengths = len(speech)
            n = len(vadsegments)
            data_with_index = [(vadsegments[i], i) for i in range(n)]
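
Note on the hunks above: max_single_segment_time for the FSMN VAD model is
specified in milliseconds, not seconds, so the demo values are scaled up
accordingly (e.g. 30 -> 30000 for a 30-second segment cap). The auto_model.py
hunk additionally falls back to 16000 Hz when the frontend object exposes no
fs attribute. A minimal sketch of the corrected VAD usage, reusing the model
IDs from the whisper demo above and assuming the AutoModel API as exercised
in these demos (the input path is hypothetical):

    from funasr import AutoModel

    # max_single_segment_time is in milliseconds: 30000 ms = 30 s per VAD segment.
    model = AutoModel(model="iic/Whisper-large-v3",
                      vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                      vad_kwargs={"max_single_segment_time": 30000},
                      )
    res = model.generate(input="example.wav")  # hypothetical input file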