sensevoice

2025-09-15 15:08:35 +08:00 · 2024-07-16 13:59:29 +08:00 · 2024-07-16 13:59:29 +08:00 · a694d92d37
commit a694d92d37
parent 43c47a7312
4 changed files with 136 additions and 74 deletions
--- a/README.md
+++ b/README.md
@ -95,24 +95,6 @@ pip install -r requirements.txt

 ## Inference

-### Method 1
-
-```python
-from model import SenseVoiceSmall
-
-model_dir = "iic/SenseVoiceSmall"
-m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
-
-
-res = m.inference(
-    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
-    language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    **kwargs,
-)
-
-print(res)
-```

 ### Method 2

@ -159,7 +141,24 @@ res = model.generate(

 For more usage, please refer to [docs](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)

+### Method 1

+```python
+from model import SenseVoiceSmall
+
+model_dir = "iic/SenseVoiceSmall"
+m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
+
+
+res = m.inference(
+    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
+    language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    **kwargs,
+)
+
+print(res)
+```

 ### Export and Test

--- a/README_zh.md
+++ b/README_zh.md
@ -95,8 +95,61 @@ pip install -r requirements.txt

 ## 推理

+
+
+### 使用funasr推理
+
+支持任意格式音频输入，支持任意时长输入
+
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "iic/SenseVoiceSmall"
+
+
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+
+funasr版本已经集成了vad模型，支持任意时长音频输入，`batch_size_s`单位为秒。
+如果输入均为短音频（小于30s），并且需要批量化推理，为了加快推理效率，可以移除vad模型，并设置`batch_size`
+
+```python
+model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
+
+res = model.generate(
+    input=input_file,
+    cache={},
+    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    batch_size=64, 
+)
+```
+
+更多详细用法，请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
+
 ### 直接推理

+支持任意格式音频输入，输入音频时长限制在30s以下
+
 ```python
 from model import SenseVoiceSmall

@ -114,52 +167,6 @@ res = m.inference(
 print(res)
 ```

-### 使用funasr推理
-
-```python
-from funasr import AutoModel
-from funasr.utils.postprocess_utils import rich_transcription_postprocess
-
-model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
-
-model = AutoModel(model=model_dir,
-                  vad_model="fsmn-vad",
-                  vad_kwargs={"max_single_segment_time": 30000},
-                  trust_remote_code=True, device="cuda:0")
-
-res = model.generate(
-    input=input_file,
-    cache={},
-    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    batch_size_s=0, 
-)
-
-text = rich_transcription_postprocess(res[0]["text"])
-
-print(text)
-```
-
-funasr版本已经集成了vad模型，支持任意时长音频输入，`batch_size_s`单位为秒。
-如果输入均为短音频，并且需要批量化推理，为了加快推理效率，可以移除vad模型，并设置`batch_size`
-
-```python
-model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
-
-res = model.generate(
-    input=input_file,
-    cache={},
-    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    batch_size=64, 
-)
-```
-
-更多详细用法，请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
-
 ## 服务部署

 Undo
--- a/demo_funasr.py
+++ b/demo_funasr.py
@ -3,26 +3,82 @@
 # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
 #  MIT License  (https://opensource.org/licenses/MIT)

+
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess

 model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
+

 model = AutoModel(
    model=model_dir,
-    trust_remote_code=True,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
 )

+# en
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
    cache={},
    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# zh
+res = model.generate(
+    input=f"{model.model_path}/example/zh.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# yue
+res = model.generate(
+    input=f"{model.model_path}/example/yue.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# ja
+res = model.generate(
+    input=f"{model.model_path}/example/ja.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+
+# ko
+res = model.generate(
+    input=f"{model.model_path}/example/ko.mp3",
+    cache={},
+    language="auto",  # "zn", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  #
+    merge_length_s=15,
 )
-
 text = rich_transcription_postprocess(res[0]["text"])
-
 print(text)
--- a/finetune.sh
+++ b/finetune.sh
@ -10,7 +10,7 @@ gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # model_name from model_hub, or model_dir in local path

 ## option 1, download model automatically
-model_name_or_model_dir="iic/SenseVoiceCTC"
+model_name_or_model_dir="iic/SenseVoiceSmall"

 ## option 2, download model by git
 #local_path_root=${workspace}/modelscope_models