mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
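bug fix: disable the AutoFrontend demo (not currently usable) and comment out the fbank-tensor input branch in BiCifParaformer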
This commit is contained in:
parent
9a9c3b75b5
commit
7458e39ff0
@ -11,6 +11,7 @@ res = model.generate(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/Ma
|
||||
print(res)
|
||||
|
||||
|
||||
''' can not use currently
|
||||
from funasr import AutoFrontend
|
||||
|
||||
frontend = AutoFrontend(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", model_revision="v2.0.2")
|
||||
@ -20,3 +21,4 @@ fbanks = frontend(input="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/
|
||||
for batch_idx, fbank_dict in enumerate(fbanks):
|
||||
res = model.generate(**fbank_dict)
|
||||
print(res)
|
||||
'''
|
||||
@ -235,23 +235,23 @@ class BiCifParaformer(Paraformer):
|
||||
self.nbest = kwargs.get("nbest", 1)
|
||||
|
||||
meta_data = {}
|
||||
if isinstance(data_in, torch.Tensor): # fbank
|
||||
speech, speech_lengths = data_in, data_lengths
|
||||
if len(speech.shape) < 3:
|
||||
speech = speech[None, :, :]
|
||||
if speech_lengths is None:
|
||||
speech_lengths = speech.shape[1]
|
||||
else:
|
||||
# extract fbank feats
|
||||
time1 = time.perf_counter()
|
||||
audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
|
||||
time2 = time.perf_counter()
|
||||
meta_data["load_data"] = f"{time2 - time1:0.3f}"
|
||||
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
|
||||
frontend=frontend)
|
||||
time3 = time.perf_counter()
|
||||
meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
|
||||
meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
|
||||
# if isinstance(data_in, torch.Tensor): # fbank
|
||||
# speech, speech_lengths = data_in, data_lengths
|
||||
# if len(speech.shape) < 3:
|
||||
# speech = speech[None, :, :]
|
||||
# if speech_lengths is None:
|
||||
# speech_lengths = speech.shape[1]
|
||||
# else:
|
||||
# extract fbank feats
|
||||
time1 = time.perf_counter()
|
||||
audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000))
|
||||
time2 = time.perf_counter()
|
||||
meta_data["load_data"] = f"{time2 - time1:0.3f}"
|
||||
speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
|
||||
frontend=frontend)
|
||||
time3 = time.perf_counter()
|
||||
meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
|
||||
meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000
|
||||
|
||||
speech = speech.to(device=kwargs["device"])
|
||||
speech_lengths = speech_lengths.to(device=kwargs["device"])
|
||||
|
||||
Loading…
Reference in New Issue
Block a user