From 8b74979bd38c70c731e04b32308f2c9edeabdc37 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?=
Date: Tue, 16 Jul 2024 13:57:51 +0800
Subject: [PATCH] sensevoice

---
 .../sense_voice/demo.py                       | 72 ++++++++++++++++---
 funasr/auto/auto_model.py                     |  7 +-
 2 files changed, 70 insertions(+), 9 deletions(-)

diff --git a/examples/industrial_data_pretraining/sense_voice/demo.py b/examples/industrial_data_pretraining/sense_voice/demo.py
index 1800d7591..f10e54b45 100644
--- a/examples/industrial_data_pretraining/sense_voice/demo.py
+++ b/examples/industrial_data_pretraining/sense_voice/demo.py
@@ -3,26 +3,82 @@
 # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
 # MIT License (https://opensource.org/licenses/MIT)
 
-import sys
+
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
+
 
 model = AutoModel(
     model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
 )
 
+# en
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
     cache={},
     language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# zh
+res = model.generate(
+    input=f"{model.model_path}/example/zh.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# yue
+res = model.generate(
+    input=f"{model.model_path}/example/yue.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# ja
+res = model.generate(
+    input=f"{model.model_path}/example/ja.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+
+# ko
+res = model.generate(
+    input=f"{model.model_path}/example/ko.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
 )
-
 text = rich_transcription_postprocess(res[0]["text"])
-
 print(text)
diff --git a/funasr/auto/auto_model.py b/funasr/auto/auto_model.py
index 1c2d2e48f..a82f6ed75 100644
--- a/funasr/auto/auto_model.py
+++ b/funasr/auto/auto_model.py
@@ -339,7 +339,9 @@ class AutoModel:
         # FIX(gcf): concat the vad clips for sense vocie model for better aed
         if kwargs.get("merge_vad", False):
             for i in range(len(res)):
-                res[i]["value"] = merge_vad(res[i]["value"], kwargs.get("merge_length", 15000))
+                res[i]["value"] = merge_vad(
+                    res[i]["value"], kwargs.get("merge_length_s", 15) * 1000
+                )
 
         # step.2 compute asr model
         model = self.model
@@ -380,6 +382,9 @@ class AutoModel:
         if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
             batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])
 
+        if kwargs["device"] == "cpu":
+            batch_size = 0
+
         beg_idx = 0
         beg_asr_total = time.time()
         time_speech_total_per_sample = speech_lengths / 16000