From a694d92d37f63c800291c4c9e5b64054afa7278c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= <zhifu.gzf@alibaba-inc.com>
Date: Tue, 16 Jul 2024 13:59:29 +0800
Subject: [PATCH] sensevoice

---
 README.md      | 35 +++++++++---------
 README_zh.md   | 99 +++++++++++++++++++++++++++-----------------------
 demo_funasr.py | 74 ++++++++++++++++++++++++++++++++-----
 finetune.sh    |  2 +-
 4 files changed, 136 insertions(+), 74 deletions(-)

diff --git a/README.md b/README.md
index 48366f3..a381855 100644
--- a/README.md
+++ b/README.md
@@ -95,24 +95,6 @@ pip install -r requirements.txt
 
 ## Inference
 
-### Method 1
-
-```python
-from model import SenseVoiceSmall
-
-model_dir = "iic/SenseVoiceSmall"
-m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
-
-
-res = m.inference(
-    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
-    language="zh", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    **kwargs,
-)
-
-print(res)
-```
 
 ### Method 2
 
@@ -159,7 +141,24 @@ res = model.generate(
 
 For more usage, please refer to [docs](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
 
+### Method 1
 
+```python
+from model import SenseVoiceSmall
+
+model_dir = "iic/SenseVoiceSmall"
+m, kwargs = SenseVoiceSmall.from_pretrained(model=model_dir)
+
+
+res = m.inference(
+    data_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav",
+    language="zh", # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    **kwargs,
+)
+
+print(res)
+```
 
 ### Export and Test
 
diff --git a/README_zh.md b/README_zh.md
index f8f8a5f..a8ce4d1 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -95,8 +95,61 @@ pip install -r requirements.txt
 
 ## 推理
 
+
+
+### 使用funasr推理
+
+支持任意格式音频输入，支持任意时长输入
+
+```python
+from funasr import AutoModel
+from funasr.utils.postprocess_utils import rich_transcription_postprocess
+
+model_dir = "iic/SenseVoiceSmall"
+
+
+model = AutoModel(
+    model=model_dir,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
+)
+
+# en
+res = model.generate(
+    input=f"{model.model_path}/example/en.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+```
+
+funasr版本已经集成了vad模型，支持任意时长音频输入，`batch_size_s`单位为秒。
+如果输入均为短音频（小于30s），并且需要批量化推理，为了加快推理效率，可以移除vad模型，并设置`batch_size`
+
+```python
+model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
+
+res = model.generate(
+    input=input_file,
+    cache={},
+    language="auto", # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=False,
+    batch_size=64, 
+)
+```
+
+更多详细用法，请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
+
 ### 直接推理
 
+支持任意格式音频输入，输入音频时长限制在30s以下
+
 ```python
 from model import SenseVoiceSmall
 
@@ -114,52 +167,6 @@ res = m.inference(
 print(res)
 ```
 
-### 使用funasr推理
-
-```python
-from funasr import AutoModel
-from funasr.utils.postprocess_utils import rich_transcription_postprocess
-
-model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
-
-model = AutoModel(model=model_dir,
-                  vad_model="fsmn-vad",
-                  vad_kwargs={"max_single_segment_time": 30000},
-                  trust_remote_code=True, device="cuda:0")
-
-res = model.generate(
-    input=input_file,
-    cache={},
-    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    batch_size_s=0, 
-)
-
-text = rich_transcription_postprocess(res[0]["text"])
-
-print(text)
-```
-
-funasr版本已经集成了vad模型，支持任意时长音频输入，`batch_size_s`单位为秒。
-如果输入均为短音频，并且需要批量化推理，为了加快推理效率，可以移除vad模型，并设置`batch_size`
-
-```python
-model = AutoModel(model=model_dir, trust_remote_code=True, device="cuda:0")
-
-res = model.generate(
-    input=input_file,
-    cache={},
-    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
-    batch_size=64, 
-)
-```
-
-更多详细用法，请参考 [文档](https://github.com/modelscope/FunASR/blob/main/docs/tutorial/README.md)
-
 ## 服务部署
 
 Undo
diff --git a/demo_funasr.py b/demo_funasr.py
index 3a36b1f..f10e54b 100644
--- a/demo_funasr.py
+++ b/demo_funasr.py
@@ -3,26 +3,82 @@
 # Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
 #  MIT License  (https://opensource.org/licenses/MIT)
 
+
 from funasr import AutoModel
 from funasr.utils.postprocess_utils import rich_transcription_postprocess
 
 model_dir = "iic/SenseVoiceSmall"
-input_file = (
-    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
-)
+
 
 model = AutoModel(
     model=model_dir,
-    trust_remote_code=True,
+    vad_model="fsmn-vad",
+    vad_kwargs={"max_single_segment_time": 30000},
+    device="cpu",
 )
 
+# en
 res = model.generate(
-    input=input_file,
+    input=f"{model.model_path}/example/en.mp3",
     cache={},
-    language="auto", # "zn", "en", "yue", "ja", "ko", "nospeech"
-    use_itn=False,
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# zh
+res = model.generate(
+    input=f"{model.model_path}/example/zh.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# yue
+res = model.generate(
+    input=f"{model.model_path}/example/yue.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+# ja
+res = model.generate(
+    input=f"{model.model_path}/example/ja.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
+)
+text = rich_transcription_postprocess(res[0]["text"])
+print(text)
+
+
+# ko
+res = model.generate(
+    input=f"{model.model_path}/example/ko.mp3",
+    cache={},
+    language="auto",  # "zh", "en", "yue", "ja", "ko", "nospeech"
+    use_itn=True,
+    batch_size_s=60,
+    merge_vad=True,  # merge short VAD segments (up to merge_length_s seconds)
+    merge_length_s=15,
 )
-
 text = rich_transcription_postprocess(res[0]["text"])
-
 print(text)
diff --git a/finetune.sh b/finetune.sh
index 4fad573..7d568fb 100644
--- a/finetune.sh
+++ b/finetune.sh
@@ -10,7 +10,7 @@ gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
 # model_name from model_hub, or model_dir in local path
 
 ## option 1, download model automatically
-model_name_or_model_dir="iic/SenseVoiceCTC"
+model_name_or_model_dir="iic/SenseVoiceSmall"
 
 ## option 2, download model by git
 #local_path_root=${workspace}/modelscope_models