mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
vad conf
This commit is contained in:
parent
60a7d39f39
commit
d29f201e32
@@ -9,7 +9,7 @@ from funasr import AutoModel
|
||||
model = AutoModel(model="iic/emotion2vec_base_finetuned", model_revision="v2.0.4",
|
||||
# vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
# vad_model_revision="v2.0.4",
|
||||
- # vad_kwargs={"max_single_segment_time": 10},
|
||||
+ # vad_kwargs={"max_single_segment_time": 1000},
|
||||
)
|
||||
|
||||
wav_file = f"{model.model_path}/example/test.wav"
|
||||
|
||||
@@ -9,7 +9,7 @@ model = AutoModel(model="iic/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-
|
||||
model_revision="v2.0.4",
|
||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
vad_model_revision="v2.0.4",
|
||||
- vad_kwargs={"max_single_segment_time": 60},
|
||||
+ vad_kwargs={"max_single_segment_time": 60000},
|
||||
punc_model="iic/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
|
||||
punc_model_revision="v2.0.4",
|
||||
# spk_model="iic/speech_campplus_sv_zh-cn_16k-common",
|
||||
|
||||
@@ -10,7 +10,7 @@ from funasr import AutoModel
|
||||
model = AutoModel(model="iic/Whisper-large-v3",
|
||||
model_revision="v2.0.5",
|
||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
- vad_kwargs={"max_single_segment_time": 30},
|
||||
+ vad_kwargs={"max_single_segment_time": 30000},
|
||||
)
|
||||
|
||||
res = model.generate(
|
||||
|
||||
@@ -12,7 +12,7 @@ from funasr import AutoModel
|
||||
# model = AutoModel(model="Whisper-large-v2", hub="openai")
|
||||
model = AutoModel(model="Whisper-large-v3",
|
||||
vad_model="iic/speech_fsmn_vad_zh-cn-16k-common-pytorch",
|
||||
- vad_kwargs={"max_single_segment_time": 30},
|
||||
+ vad_kwargs={"max_single_segment_time": 30000},
|
||||
hub="openai",
|
||||
)
|
||||
|
||||
|
||||
@@ -312,7 +312,8 @@ class AutoModel:
|
||||
key = res[i]["key"]
|
||||
vadsegments = res[i]["value"]
|
||||
input_i = data_list[i]
|
||||
- speech = load_audio_text_image_video(input_i, fs=kwargs["frontend"].fs, audio_fs=kwargs.get("fs", 16000))
|
||||
+ fs = kwargs["frontend"].fs if hasattr(kwargs["frontend"], "fs") else 16000
|
||||
+ speech = load_audio_text_image_video(input_i, fs=fs, audio_fs=kwargs.get("fs", 16000))
|
||||
speech_lengths = len(speech)
|
||||
n = len(vadsegments)
|
||||
data_with_index = [(vadsegments[i], i) for i in range(n)]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user