atsr

2025-09-15 14:48:36 +08:00 · 2024-03-04 16:44:31 +08:00 · 2024-03-04 16:44:31 +08:00 · d9e60d9ddc
commit d9e60d9ddc
parent 1a6d9d5cc4
2 changed files with 75 additions and 24 deletions
--- a/examples/industrial_data_pretraining/lcbnet/README.md
+++ b/examples/industrial_data_pretraining/lcbnet/README.md
@@ -91,6 +91,79 @@ finetune-support: True



+## 基于ModelScope进行推理
+
+- 推理支持音频格式如下：
+  - wav文件路径，例如：data/test/asr_example.wav
+  - pcm文件路径，例如：data/test/asr_example.pcm
+  - ark文件路径，例如：data/test/data.ark
+  - wav文件url，例如：https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav
+  - wav二进制数据，格式bytes，例如：用户直接从文件里读出bytes数据或者是麦克风录出bytes数据。
+  - 已解析的audio音频，例如：audio, rate = soundfile.read("asr_example_zh.wav")，类型为numpy.ndarray或者torch.Tensor。
+  - wav.scp文件，需符合如下要求(以下分别为sound和kaldi_ark格式)：
+
+```sh
+cat wav.scp
+asr_example1  data/test/asr_example1.wav
+asr_example2  data/test/asr_example2.wav
+
+cat wav.scp
+asr_example1  data/test/data_wav.ark:22
+asr_example2  data/test/data_wav.ark:90445
+...
+```
+
+- 推理支持OCR预测文本格式如下：
+  - ocr.txt文件，需符合如下要求：
+```sh
+cat ocr.txt
+asr_example1  ANIMAL <blank> RIGHTS <blank> MANAGER <blank> PLOEG
+asr_example2  UNIVERSITY <blank> CAMPUS <blank> DEANO
+...
+```
+
+- 若输入的wav文件和ocr文件均为url，api调用方式可参考如下范例：
+
+```python
+from funasr import AutoModel
+
+model = AutoModel(model="iic/LCB-NET",
+                  model_revision="v2.0.0")
+res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text"))
+```
+
+
+## 复现论文中的结果
+```sh
+python -m funasr.bin.inference \
+        --config-path=${file_dir} \
+        --config-name="config.yaml" \
+        ++init_param=${file_dir}/model.pt \
+        ++tokenizer_conf.token_list=${file_dir}/tokens.txt \
+        ++input=[${_logdir}/wav.scp,${_logdir}/ocr.txt] \
+        ++data_type='["kaldi_ark", "text"]' \
+        ++tokenizer_conf.bpemodel=${file_dir}/bpe.pt \
+        ++output_dir="${inference_dir}/results" \
+        ++device="${inference_device}" \
+        ++ncpu=1 \
+        ++disable_log=true
+
+```
+
+
+识别结果输出路径结构如下：
+
+```sh
+tree output_dir/
+output_dir/
+└── 1best_recog
+    ├── text
+    └── token
+```
+
+token：语音识别结果文件
+
+可以使用funasr里面提供的run_bwer_recall.sh计算WER、BWER、UWER和Recall。


 ## 相关论文以及引用信息
--- a/examples/industrial_data_pretraining/lcbnet/demo.py
+++ b/examples/industrial_data_pretraining/lcbnet/demo.py
@@ -6,30 +6,8 @@
 from funasr import AutoModel

 model = AutoModel(model="iic/LCB-NET",
-                  model_revision="v1.0.0")
+                  model_revision="v2.0.0")

-
-# example1
 res = model.generate(input=("https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/asr_example.wav","https://www.modelscope.cn/api/v1/models/iic/LCB-NET/repo?Revision=master&FilePath=example/ocr.txt"),data_type=("sound", "text"))

-print(res)
-
-
-'''
-# tensor or numpy as input
-# example2
-import torchaudio
-import os
-wav_file = os.path.join(model.model_path, "example/asr_example.wav")
-input_tensor, sample_rate = torchaudio.load(wav_file)
-input_tensor = input_tensor.mean(0)
-res = model.generate(input=[input_tensor], batch_size_s=300, is_final=True)
-
-
-# example3
-import soundfile
-
-wav_file = os.path.join(model.model_path, "example/asr_example.wav")
-speech, sample_rate = soundfile.read(wav_file)
-res = model.generate(input=[speech], batch_size_s=300, is_final=True)
-'''
+print(res)