From a694d92d37f63c800291c4c9e5b64054afa7278c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Tue, 16 Jul 2024 13:59:29 +0800 Subject: [PATCH] sensevoice --- README.md | 35 +++++++++--------- README_zh.md | 99 +++++++++++++++++++++++++++----------------------- demo_funasr.py | 74 ++++++++++++++++++++++++++++++++----- finetune.sh | 2 +- 4 files changed, 136 insertions(+), 74 deletions(-) diff --git a/README.md b/README.md index 48366f3..a381855 100644 --- a/README.md +++ b/README.md @@ -95,24 +95,6 @@ pip install -r requirements.txt ## Inference -### Method 1 - -```python -from model import SenseVoiceSmall - -model_dir = "iic/SenseVoiceSmall" -m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir) - - -res = m.inference( - data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", - language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech" - use_itn=False, - **kwargs, -) - -print(res) -``` ### Method 2 @@ -159,7 +141,24 @@ res = model.generate( For more usage, please refer to [docs](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md) +### Method 1 +```python +from model import SenseVoiceSmall + +model_dir = "iic/SenseVoiceSmall" +m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir) + + +res = m.inference( + data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav", + language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=False, + **kwargs, +) + +print(res) +``` ### Export and Test diff --git a/README_zh.md b/README_zh.md index f8f8a5f..a8ce4d1 100644 --- a/README_zh.md +++ b/README_zh.md @@ -95,8 +95,61 @@ pip install -r requirements.txt ## 推理 + + +### 使用funasr推理 + +支持任意格式音频输入,支持任意时长输入 + +```python +from funasr import AutoModel +from funasr.utils.postprocess_utils import rich_transcription_postprocess + +model_dir = "iic/SenseVoiceSmall" + + +model = AutoModel( + model=model_dir, + vad_model="fsmn-vad", + 
vad_kwargs={"max_single_segment_time": 30000}, + device="cpu", +) + +# en +res = model.generate( + input=f"{model.model_path}/example/en.mp3", + cache={}, + language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + merge_length_s=15, +) +text = rich_transcription_postprocess(res[0]["text"]) +print(text) +``` + +funasr版本已经集成了vad模型,支持任意时长音频输入,`batch_size_s`单位为秒。 +如果输入均为短音频(小于30s),并且需要批量化推理,为了加快推理效率,可以移除vad模型,并设置`batch_size` + +```python +model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0") + +res = model.generate( + input=input_file, + cache={}, + language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" + use_itn=False, + batch_size=64, +) +``` + +更多详细用法,请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md) + ### 直接推理 +支持任意格式音频输入,输入音频时长限制在30s以下 + ```python from model import SenseVoiceSmall @@ -114,52 +167,6 @@ res = m.inference( print(res) ``` -### 使用funasr推理 - -```python -from funasr import AutoModel -from funasr.utils.postprocess_utils import rich_transcription_postprocess - -model_dir = "iic/SenseVoiceSmall" -input_file = ( - "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" -) - -model = AutoModel(model=model_dir, - vad_model="fsmn-vad", - vad_kwargs={"max_single_segment_time": 30000}, - trust_remote_code=True, device="cuda:0") - -res = model.generate( - input=input_file, - cache={}, - language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" - use_itn=False, - batch_size_s=0, -) - -text = rich_transcription_postprocess(res[0]["text"]) - -print(text) -``` - -funasr版本已经集成了vad模型,支持任意时长音频输入,`batch_size_s`单位为秒。 -如果输入均为短音频,并且需要批量化推理,为了加快推理效率,可以移除vad模型,并设置`batch_size` - -```python -model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0") - -res = model.generate( - input=input_file, - cache={}, - language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" - use_itn=False, - batch_size=64, -) 
-``` - -更多详细用法,请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md) - ## 服务部署 Undo diff --git a/demo_funasr.py b/demo_funasr.py index 3a36b1f..f10e54b 100644 --- a/demo_funasr.py +++ b/demo_funasr.py @@ -3,26 +3,82 @@ # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. # MIT License (https://opensource.org/licenses/MIT) + from funasr import AutoModel from funasr.utils.postprocess_utils import rich_transcription_postprocess model_dir = "iic/SenseVoiceSmall" -input_file = ( - "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav" -) + model = AutoModel( model=model_dir, - trust_remote_code=True, + vad_model="fsmn-vad", + vad_kwargs={"max_single_segment_time": 30000}, + device="cpu", ) +# en res = model.generate( - input=input_file, + input=f"{model.model_path}/example/en.mp3", cache={}, - language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech" - use_itn=False, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + merge_length_s=15, +) +text = rich_transcription_postprocess(res[0]["text"]) +print(text) + +# zh +res = model.generate( + input=f"{model.model_path}/example/zh.mp3", + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + merge_length_s=15, +) +text = rich_transcription_postprocess(res[0]["text"]) +print(text) + +# yue +res = model.generate( + input=f"{model.model_path}/example/yue.mp3", + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + merge_length_s=15, +) +text = rich_transcription_postprocess(res[0]["text"]) +print(text) + +# ja +res = model.generate( + input=f"{model.model_path}/example/ja.mp3", + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + 
merge_length_s=15, +) +text = rich_transcription_postprocess(res[0]["text"]) +print(text) + + +# ko +res = model.generate( + input=f"{model.model_path}/example/ko.mp3", + cache={}, + language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech" + use_itn=True, + batch_size_s=60, + merge_vad=True, # + merge_length_s=15, ) - text = rich_transcription_postprocess(res[0]["text"]) - print(text) diff --git a/finetune.sh b/finetune.sh index 4fad573..7d568fb 100644 --- a/finetune.sh +++ b/finetune.sh @@ -10,7 +10,7 @@ gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') # model_name from model_hub, or model_dir in local path ## option 1, download model automatically -model_name_or_model_dir="iic/SenseVoiceCTC" +model_name_or_model_dir="iic/SenseVoiceSmall" ## option 2, download model by git #local_path_root=${workspace}/modelscope_models