diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2497ac2f4..75651b690 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,10 +18,6 @@ jobs:
with:
docs-folder: "docs/"
pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- - uses: ammaraskar/sphinx-action@master
- with:
- docs-folder: "docs_cn/"
- pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- name: deploy copy
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh'
@@ -31,9 +27,6 @@ jobs:
mkdir public/en
touch public/en/.nojekyll
cp -r docs/_build/html/* public/en/
- mkdir public/cn
- touch public/cn/.nojekyll
- cp -r docs_cn/_build/html/* public/cn/
mkdir public/m2met2
touch public/m2met2/.nojekyll
cp -r docs_m2met2/_build/html/* public/m2met2/
diff --git a/.gitignore b/.gitignore
index 13d2fff66..33b8c3979 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,8 @@ export/*
*.pyc
.eggs
MaaS-lib
-.gitignore
\ No newline at end of file
+.gitignore
+.egg*
+dist
+build
+funasr.egg-info
\ No newline at end of file
diff --git a/README.md b/README.md
index 03156f31e..29ddd4ac0 100644
--- a/README.md
+++ b/README.md
@@ -12,13 +12,13 @@
[**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
| [**Highlights**](#highlights)
| [**Installation**](#installation)
-| [**Docs_EN**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
+| [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
| [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
| [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/modelscope_models.md)
| [**Contact**](#contact)
-
+|
[**M2MET2.0 Guidence_CN**](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)
| [**M2MET2.0 Guidence_EN**](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)
diff --git a/docs/FQA.md b/docs/FQA.md
new file mode 100644
index 000000000..46c5aa38a
--- /dev/null
+++ b/docs/FQA.md
@@ -0,0 +1,22 @@
+# FAQ
+
+## How to use VAD model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+## How to use Punctuation model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/238)
+
+## How to use Paraformer model for streaming by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+## How to use vad, asr and punc model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/278)
+
+## How to combine vad, asr, punc and nnlm models inside modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/134)
+
+## How to combine timestamp prediction model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/246)
+
+## How to switch decoding mode between online and offline for UniASR model
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_libtorch.md b/docs/benchmark/benchmark_libtorch.md
new file mode 120000
index 000000000..f1cd73c53
--- /dev/null
+++ b/docs/benchmark/benchmark_libtorch.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_libtorch.md
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_onnx.md b/docs/benchmark/benchmark_onnx.md
new file mode 120000
index 000000000..14e2fbebf
--- /dev/null
+++ b/docs/benchmark/benchmark_onnx.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_onnx.md
\ No newline at end of file
diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index aea2b06ef..6ac3ab8a5 100644
Binary files a/docs/images/dingding.jpg and b/docs/images/dingding.jpg differ
diff --git a/docs/index.rst b/docs/index.rst
index e5b9ab8f9..14c9525cf 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,6 +47,7 @@ FunASR hopes to build a bridge between academic research and industrial applicat
./modescope_pipeline/punc_pipeline.md
./modescope_pipeline/tp_pipeline.md
./modescope_pipeline/sv_pipeline.md
+ ./modescope_pipeline/sd_pipeline.md
./modescope_pipeline/lm_pipeline.md
.. toctree::
@@ -61,13 +62,24 @@ FunASR hopes to build a bridge between academic research and industrial applicat
./runtime/grpc_cpp.md
./runtime/websocket_python.md
+.. toctree::
+ :maxdepth: 1
+    :caption: Benchmark and Leaderboard
+
+ ./benchmark/benchmark_onnx.md
+ ./benchmark/benchmark_libtorch.md
+
.. toctree::
:maxdepth: 1
:caption: Papers
./papers.md
+.. toctree::
+ :maxdepth: 1
+    :caption: FAQ
+ ./FQA.md
Indices and tables
diff --git a/docs/modelscope_models.md b/docs/modelscope_models.md
index b35d625d2..3538ae0d3 100644
--- a/docs/modelscope_models.md
+++ b/docs/modelscope_models.md
@@ -80,7 +80,7 @@ Here we provided several pretrained models on different datasets. The details of
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours) | 17.5M | 3465 | Xvector, speaker verification, Chinese |
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) | 61M | 6135 | Xvector, speaker verification, English |
-### Speaker diarization Models
+### Speaker Diarization Models
| Model Name | Training Data | Parameters | Notes |
|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
diff --git a/docs/modescope_pipeline/asr_pipeline.md b/docs/modescope_pipeline/asr_pipeline.md
index 3dc0bd0ca..8b6b24d1f 100644
--- a/docs/modescope_pipeline/asr_pipeline.md
+++ b/docs/modescope_pipeline/asr_pipeline.md
@@ -1,20 +1,196 @@
# Speech Recognition
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take model of Paraformer and Paraformer-online as example to demonstrate the usage.
+
## Inference
### Quick start
+#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+)
-#### Inference with multi-threads on CPU
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+#### [Paraformer-online model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
+
+param_dict = {"cache": dict(), "is_final": False}
+chunk_stride = 7680  # 480ms
+# first chunk, 480ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 480ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+#### [UniASR model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+```python
+decoding_model = "fast" # "fast"、"normal"、"offline"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
+ param_dict={"decoding_model": decoding_model})
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+The decoding mode of `fast` and `normal`
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
+#### [RNN-T-online model]()
+TODO
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.auto_speech_recognition`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point,`e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+  - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - model: # model name on ModelScope
+ - data_dir: # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
+ - output_dir: # result dir
+ - batch_size: # batchsize of inference
+ - gpu_inference: # whether to perform gpu decoding, set false for cpu decoding
+ - gpuid_list: # set gpus, e.g., gpuid_list="0,1"
+ - njob: # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
-#### Inference with multi GPU
## Finetune with pipeline
### Quick start
+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+```python
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ from funasr.utils.modelscope_param import modelscope_args
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ params.output_dir = "./checkpoint" # 模型保存路径
+ params.data_path = "speech_asr_aishell1_trainsets" # 数据路径,可以为modelscope中已上传数据,也可以是本地数据
+ params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
+ params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
+ params.max_epoch = 50 # 最大训练轮数
+ params.lr = 0.00005 # 设置学习率
+
+ modelscope_finetune(params)
+```
+
+```shell
+python finetune.py &> log.txt &
+```
### Finetune with your data
-## Inference with your finetuned model
+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+ - output_dir: # result dir
+ - data_dir: # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+ - dataset_type: # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+ - batch_bins: # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
+ - max_epoch: # number of training epoch
+ - lr: # learning rate
+- Then you can run the pipeline to finetune with:
+```shell
+python finetune.py
+```
+If you want finetune with multi-GPUs, you could:
+```shell
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
+```
+## Inference with your finetuned model
+- Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py)
+ - modelscope_model_name: # model name on ModelScope
+ - output_dir: # result dir
+ - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
+ - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+ - batch_size: # batchsize of inference
+
+- Then you can run the pipeline to finetune with:
+```python
+ python infer_after_finetune.py
+```
diff --git a/docs/modescope_pipeline/lm_pipeline.md b/docs/modescope_pipeline/lm_pipeline.md
index cb818719e..c4090ece6 100644
--- a/docs/modescope_pipeline/lm_pipeline.md
+++ b/docs/modescope_pipeline/lm_pipeline.md
@@ -1,10 +1,10 @@
-# Speech Recognition
+# Language Models
## Inference with pipeline
### Quick start
-#### Inference with you data
-#### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with your data
+### Inference with multi-threads on CPU
+### Inference with multi GPU
## Finetune with pipeline
### Quick start
diff --git a/docs/modescope_pipeline/punc_pipeline.md b/docs/modescope_pipeline/punc_pipeline.md
index 67ee6950a..a0203d707 100644
--- a/docs/modescope_pipeline/punc_pipeline.md
+++ b/docs/modescope_pipeline/punc_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/quick_start.md b/docs/modescope_pipeline/quick_start.md
index ab46a7c73..b1614f511 100644
--- a/docs/modescope_pipeline/quick_start.md
+++ b/docs/modescope_pipeline/quick_start.md
@@ -59,8 +59,7 @@ from modelscope.utils.constant import Tasks
inference_pipeline = pipeline(
task=Tasks.speech_timestamp,
- model='damo/speech_timestamp_prediction-v1-16k-offline',
- output_dir='./tmp')
+ model='damo/speech_timestamp_prediction-v1-16k-offline',)
rec_result = inference_pipeline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
@@ -88,6 +87,71 @@ rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.al
print(rec_result["scores"][0])
```
+### Speaker Diarization
+#### SOND
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+ mode="sond_demo",
+ num_workers=0,
+ task=Tasks.speaker_diarization,
+ diar_model_config="sond.yaml",
+ model='damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch',
+ sv_model="damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch",
+ sv_model_revision="master",
+)
+
+audio_list=[
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
+]
+
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
+```
+
+### FAQ
+#### How to switch device from GPU to CPU with pipeline
+
+The pipeline defaults to decoding with GPU (`ngpu=1`) when GPU is available. If you want to switch to CPU, you could set `ngpu=0`
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+ ngpu=0,
+)
+```
+
+#### How to infer from local model path
+Download model to local dir, by modelscope-sdk
+
+```python
+from modelscope.hub.snapshot_download import snapshot_download
+
+local_dir_root = "./models_from_modelscope"
+model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', cache_dir=local_dir_root)
+```
+
+Or download model to local dir, by git lfs
+```shell
+git lfs install
+# git clone https://www.modelscope.cn//.git
+git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
+```
+
+Infer with local model path
+```python
+local_dir_root = "./models_from_modelscope/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=local_dir_root,
+)
+```
+
## Finetune with pipeline
### Speech Recognition
#### Paraformer model
@@ -132,6 +196,10 @@ if __name__ == '__main__':
```shell
python finetune.py &> log.txt &
```
+
+### FAQ
+#### Multi GPUs training and distributed training
+
If you want finetune with multi-GPUs, you could:
```shell
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
diff --git a/docs/modescope_pipeline/sd_pipeline.md b/docs/modescope_pipeline/sd_pipeline.md
new file mode 100644
index 000000000..1330fe6f7
--- /dev/null
+++ b/docs/modescope_pipeline/sd_pipeline.md
@@ -0,0 +1,20 @@
+# Speaker Diarization
+
+## Inference with pipeline
+
+### Quick start
+
+### Inference with your data
+
+### Inference with multi-threads on CPU
+
+### Inference with multi GPU
+
+## Finetune with pipeline
+
+### Quick start
+
+### Finetune with your data
+
+## Inference with your finetuned model
+
diff --git a/docs/modescope_pipeline/sv_pipeline.md b/docs/modescope_pipeline/sv_pipeline.md
index 6ce8c6a2c..c57db3890 100644
--- a/docs/modescope_pipeline/sv_pipeline.md
+++ b/docs/modescope_pipeline/sv_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/tp_pipeline.md b/docs/modescope_pipeline/tp_pipeline.md
index fad55e3b9..9b1719bf3 100644
--- a/docs/modescope_pipeline/tp_pipeline.md
+++ b/docs/modescope_pipeline/tp_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
index 5dcbe59a7..9d9b77a70 100644
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -1,14 +1,107 @@
# Voice Activity Detection
-## Inference with pipeline
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take model of FSMN-VAD as example to demonstrate the usage.
+
+## Inference
### Quick start
+#### [FSMN-VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+)
-#### Inference with multi-threads on CPU
+segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
+#### [FSMN-VAD-online model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+inference_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
+
+param_dict = {"in_cache": dict(), "is_final": False}
+chunk_stride = 1600  # 100ms
+# first chunk, 100ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 100ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.voice_activity_detection`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point,`e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+  - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - model: # model name on ModelScope
+ - data_dir: # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
+ - output_dir: # result dir
+ - batch_size: # batchsize of inference
+ - gpu_inference: # whether to perform gpu decoding, set false for cpu decoding
+ - gpuid_list: # set gpus, e.g., gpuid_list="0,1"
+ - njob: # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
-#### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/recipe/sd_recipe.md b/docs/recipe/sd_recipe.md
new file mode 100644
index 000000000..90eb4b310
--- /dev/null
+++ b/docs/recipe/sd_recipe.md
@@ -0,0 +1,129 @@
+# Speaker Diarization
+Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
+
+## Overall Introduction
+We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
+- `CUDA_VISIBLE_DEVICES`: visible gpu list
+- `gpu_num`: the number of GPUs used for training
+- `gpu_inference`: whether to use GPUs for decoding
+- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
+- `data_aishell`: the raw path of AISHELL-1 dataset
+- `feats_dir`: the path for saving processed data
+- `nj`: the number of jobs for data preparation
+- `speed_perturb`: the range of speech perturbed
+- `exp_dir`: the path for saving experimental results
+- `tag`: the suffix of experimental result directory
+
+## Stage 0: Data preparation
+This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
+* `wav.scp`
+```
+BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
+BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
+...
+```
+* `text`
+```
+BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
+BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
+BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
+...
+```
+These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
+
+## Stage 1: Feature Generation
+This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
+* `feats.scp`
+```
+...
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+...
+```
+Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
+* `speech_shape`
+```
+...
+BAC009S0002W0122_sp0.9 665,80
+...
+```
+* `text_shape`
+```
+...
+BAC009S0002W0122_sp0.9 15
+...
+```
+These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
+
+## Stage 2: Dictionary Preparation
+This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
+* `tokens.txt`
+```
+<blank>
+<s>
+</s>
+一
+丁
+...
+龚
+龟
+<unk>
+```
+* `<blank>`: indicates the blank token for CTC
+* `<s>`: indicates the start-of-sentence token
+* `</s>`: indicates the end-of-sentence token
+* `<unk>`: indicates the out-of-vocabulary token
+
+## Stage 3: Training
+This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
+
+* DDP Training
+
+We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
+
+* DataLoader
+
+We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it.
+
+* Configuration
+
+The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
+
+* Training Steps
+
+We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
+
+* Tensorboard
+
+Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
+```
+tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
+```
+
+## Stage 4: Decoding
+This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model.
+
+* Mode Selection
+
+As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
+
+* Configuration
+
+We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0
-
-
-一
-丁
-...
-龚
-龟
-
-```
-* ``: 表示CTC训练中的blank
-* ``: 表示句子的起始符
-* ``: 表示句子的终止符
-* ``: 表示字典外的字符
-
-## 阶段 3:训练
-本阶段对应模型的训练。在开始训练之前,需要指定实验结果保存目录`exp_dir`,训练可用GPU`CUDA_VISIBLE_DEVICES`和训练的gpu数量`gpu_num`。默认情况下,最好的`$keep_nbest_models`模型结果会被平均从而来获取更好的性能。
-
-* DDP Training
-
-我们提供了分布式训练(DDP)功能,具体的细节可以在[这里](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) 找到。为了开启分布式训练,需要设置`gpu_num`大于1。例如,设置`CUDA_VISIBLE_DEVICES=0,1,5,6,7`,`gpu_num=3`,则编号为0,1和5的GPU会被用于训练。
-
-* DataLoader
-
-我们提供了基于[Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) 实现的大数据DataLoader,用户可以通过设置`dataset_type=large`来启用。
-
-* Configuration
-
-训练相关的参数,包括模型,优化器,数据等,均可以通过`conf`目录下的config文件指定。同时,用户也可以直接在`run.sh`脚本中指定相关参数。请避免在config文件和`run.sh`脚本中设置相同的参数,以免造成歧义。
-
-* Training Steps
-
-我们提供了两种方式来控制训练的总步数,对应的参数分别为`max_epoch`和`max_update`。`max_epoch`表示训练的最大epoch数,`max_update`表示训练的最大迭代次数。如果这两个参数同时被指定,则一旦训练步数到达其中任意一个参数,训练结束。
-
-* Tensorboard
-
-用户可以通过tensorboard来观察训练过程中的损失,学习率等。可以通过下述指定来实现:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## 阶段 4: 解码
-本阶段用于解码得到识别结果,同时计算CER来验证训练得到的模型性能。
-
-* Mode Selection
-
-由于我们提供了paraformer,uniasr和conformer等模型,因此在解码时,需要指定相应的解码模式。对应的参数为`mode`,相应的可选设置为`asr/paraformer/uniasr`等。
-
-* Configuration
-
-我们提供了ctc解码, attention解码和ctc-attention混合解码。这几种解码方式可以通过`conf`下的解码配置文件中的`ctc_weight`参数来指定。具体的,`ctc_weight=1.0`表示CTC解码, `ctc_weight=0.0`表示attention解码, `0.0`_ 上发布工业级语音识别模型以及支持相关的训练和微调,研究者和开发者们可以更方便地进行语音识别模型的研究和生产,促进语音识别生态的发展。ASR for Fun!
-
-.. toctree::
- :maxdepth: 1
- :caption: 教程:
-
- ./installation.md
- ./papers.md
- ./get_started.md
- ./build_task.md
-
-.. toctree::
- :maxdepth: 1
- :caption: ModelScope:
-
- ./modelscope_models.md
- ./modelscope_usages.md
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs_cn/installation.md b/docs_cn/installation.md
deleted file mode 100755
index a31bc0140..000000000
--- a/docs_cn/installation.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# 安装
-FunASR的安装十分便捷,下面将给出详细的安装步骤:
-
-- 安装Conda并创建虚拟环境
-``` sh
-wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-sh Miniconda3-latest-Linux-x86_64.sh
-source ~/.bashrc
-conda create -n funasr python=3.7
-conda activate funasr
-```
-
-- 安装Pytorch (版本 >= 1.7.0):
-
-```sh
-pip install torch torchaudio
-```
-
-关于更多的版本, 请参照 [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
-
-- 安装 ModelScope
-
-对于国内用户,可以通过配置下述镜像源来加快下载速度
-```sh
-pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
-```
-
-安装或更新ModelScope
-``` sh
-pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-```
-
-- 下载FunASR仓库,并安装剩余所需依赖
-``` sh
-git clone https://github.com/alibaba/FunASR.git && cd FunASR
-pip install --editable ./
-```
\ No newline at end of file
diff --git a/docs_cn/make.bat b/docs_cn/make.bat
deleted file mode 100644
index 747ffb7b3..000000000
--- a/docs_cn/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs_cn/modelscope_models.md b/docs_cn/modelscope_models.md
deleted file mode 100644
index 8501c1ffa..000000000
--- a/docs_cn/modelscope_models.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# ModelScope上的预训练模型
-
-## 模型许可证
-- Apache License 2.0
-
-## 模型库
-这里我们提供了一些基于不同数据集训练得到的几种预训练模型,所有的预训练模型和更多细节可以参见 [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 。
-
-| Datasets | Hours | Model | Online/Offline | Language | Framework | Checkpoint |
-|:-----:|:-----:|:--------------:|:--------------:| :---: | :---: | --- |
-| Alibaba Speech Data | 60000 | Paraformer | Offline | CN | Pytorch |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online](http://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN | Tensorflow |[speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Online | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Offline | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| AISHELL-1 | 178 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell1-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary) |
-| AISHELL-2 | 1000 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell2-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell2-pytorch/summary) |
-| AISHELL-1 | 178 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
-| AISHELL-1 | 178 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md
deleted file mode 100644
index c91de76cc..000000000
--- a/docs_cn/modelscope_usages.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# ModelScope 使用说明
-ModelScope是阿里巴巴推出的开源模型即服务共享平台,为广大学术界用户和工业界用户提供灵活、便捷的模型应用支持。具体的使用方法和开源模型可以参见[ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 。在语音方向,我们提供了自回归/非自回归语音识别,语音预训练,标点预测等模型,用户可以方便使用。
-
-## 整体介绍
-我们在`egs_modelscope` 目录下提供了不同模型的使用方法,支持直接用我们提供的模型进行推理,同时也支持将我们提供的模型作为预训练好的初始模型进行微调。下面,我们将以`egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch`目录中提供的模型来进行介绍,包括`infer.py`,`finetune.py`和`infer_after_finetune.py`,对应的功能如下:
-- `infer.py`: 基于我们提供的模型,对指定的数据集进行推理
-- `finetune.py`: 将我们提供的模型作为初始模型进行微调
-- `infer_after_finetune.py`: 基于微调得到的模型,对指定的数据集进行推理
-
-## 模型推理
-我们提供了`infer.py`来实现模型推理。基于此文件,用户可以基于我们提供的模型,对指定的数据集进行推理,得到相应的识别结果。如果给定了抄本,则会同时计算`CER`。在开始推理前,用户可以指定如下参数来修改推理配置:
-* `data_dir`:数据集目录。目录下应该包括音频列表文件`wav.scp`和抄本文件`text`(可选),具体格式可以参见[快速开始](./get_started.md)中的说明。如果`text`文件存在,则会相应的计算CER,否则会跳过。
-* `output_dir`:推理结果保存目录
-* `batch_size`:推理时的batch大小
-* `ctc_weight`:部分模型包含CTC模块,可以设置该参数来指定推理时,CTC模块的权重
-
-除了直接在`infer.py`中设置参数外,用户也可以通过手动修改模型下载目录下的`decoding.yaml`文件中的参数来修改推理配置。
-
-## 模型微调
-我们提供了`finetune.py`来实现模型微调。基于此文件,用户可以基于我们提供的模型作为初始模型,在指定的数据集上进行微调,从而在特征领域取得更好的性能。在微调开始前,用户可以指定如下参数来修改微调配置:
-* `data_path`:数据目录。该目录下应该包括存放训练集数据的`train`目录和存放验证集数据的`dev`目录。每个目录中需要包括音频列表文件`wav.scp`和抄本文件`text`
-* `output_dir`:微调结果保存目录
-* `dataset_type`:对于小数据集,设置为`small`;当数据量大于1000小时时,设置为`large`
-* `batch_bins`:batch size,如果dataset_type设置为`small`,batch_bins单位为fbank特征帧数;如果dataset_type设置为`large`,batch_bins单位为毫秒
-* `max_epoch`:最大的训练轮数
-
-以下参数也可以进行设置。但是如果没有特别的需求,可以忽略,直接使用我们给定的默认值:
-* `accum_grad`:梯度累积
-* `keep_nbest_models`:选择性能最好的`keep_nbest_models`个模型的参数进行平均,得到性能更好的模型
-* `optim`:设置优化器
-* `lr`:设置学习率
-* `scheduler`:设置学习率调整策略
-* `scheduler_conf`:学习率调整策略的相关参数
-* `specaug`:设置谱增广
-* `specaug_conf`:谱增广的相关参数
-
-除了直接在`finetune.py`中设置参数外,用户也可以通过手动修改模型下载目录下的`finetune.yaml`文件中的参数来修改微调配置。
-
-## 基于微调后的模型推理
-我们提供了`infer_after_finetune.py`来实现基于用户自己微调得到的模型进行推理。基于此文件,用户可以基于微调后的模型,对指定的数据集进行推理,得到相应的识别结果。如果给定了抄本,则会同时计算CER。在开始推理前,用户可以指定如下参数来修改推理配置:
-* `data_dir`:数据集目录。目录下应该包括音频列表文件`wav.scp`和抄本文件`text`(可选)。如果`text`文件存在,则会相应的计算CER,否则会跳过。
-* `output_dir`:推理结果保存目录
-* `batch_size`:推理时的batch大小
-* `ctc_weight`:部分模型包含CTC模块,可以设置该参数来指定推理时,CTC模块的权重
-* `decoding_model_name`:指定用于推理的模型名
-
-以下参数也可以进行设置。但是如果没有特别的需求,可以忽略,直接使用我们给定的默认值:
-* `modelscope_model_name`:微调时使用的初始模型名
-* `required_files`:使用modelscope接口进行推理时需要用到的文件
-
-## 注意事项
-部分模型可能在微调、推理时存在一些特有的参数,这部分参数可以在对应目录的`README.md`文件中找到具体用法。
\ No newline at end of file
diff --git a/docs_cn/papers.md b/docs_cn/papers.md
deleted file mode 100644
index 34a815033..000000000
--- a/docs_cn/papers.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# 论文
-
-- [Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model](https://arxiv.org/abs/2010.14099), arXiv preprint arXiv:2010.14099, 2020.
-- [Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition](https://arxiv.org/abs/2206.08317), INTERSPEECH 2022.
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/finetune.py b/egs_modelscope/asr/TEMPLATE/finetune.py
new file mode 100644
index 000000000..1935258b6
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/finetune.py
@@ -0,0 +1,36 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params.data_path)
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", data_path="./data")
+    params.output_dir = "./checkpoint"                 # 模型保存路径
+ params.data_path = "./example_data/" # 数据路径
+ params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
+ params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
+ params.max_epoch = 50 # 最大训练轮数
+ params.lr = 0.00005 # 设置学习率
+
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/TEMPLATE/infer.py b/egs_modelscope/asr/TEMPLATE/infer.py
new file mode 100644
index 000000000..9f280d50b
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=args.model,
+ output_dir=args.output_dir,
+ batch_size=args.batch_size,
+ )
+ inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+ parser.add_argument('--output_dir', type=str, default="./results/")
+ parser.add_argument('--batch_size', type=int, default=64)
+ parser.add_argument('--gpuid', type=str, default="0")
+ args = parser.parse_args()
+ modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/infer.sh b/egs_modelscope/asr/TEMPLATE/infer.sh
new file mode 100644
index 000000000..b8b011c0a
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true # whether to perform gpu decoding
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+
+. utils/parse_options.sh || exit 1;
+
+if [ ${gpu_inference} == "true" ]; then
+ nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+ nj=$njob
+ batch_size=1
+ gpuid_list=""
+ for JOB in $(seq ${nj}); do
+ gpuid_list=$gpuid_list"-1,"
+ done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+ split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+ echo "Decoding ..."
+ gpuid_list_array=(${gpuid_list//,/ })
+ for JOB in $(seq ${nj}); do
+ {
+ id=$((JOB-1))
+ gpuid=${gpuid_list_array[$id]}
+ mkdir -p ${output_dir}/output.$JOB
+ python infer.py \
+ --model ${model} \
+ --audio_in ${output_dir}/split/wav.$JOB.scp \
+ --output_dir ${output_dir}/output.$JOB \
+ --batch_size ${batch_size} \
+ --gpuid ${gpuid}
+ }&
+ done
+ wait
+
+ mkdir -p ${output_dir}/1best_recog
+ for f in token score text; do
+ if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+ for i in $(seq "${nj}"); do
+ cat "${output_dir}/output.${i}/1best_recog/${f}"
+ done | sort -k1 >"${output_dir}/1best_recog/${f}"
+ fi
+ done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+ echo "Computing WER ..."
+ cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+ cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+ python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+ tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+ echo "SpeechIO TIOBE textnorm"
+ echo "$0 --> Normalizing REF text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${data_dir}/text \
+ ${output_dir}/1best_recog/ref.txt
+
+ echo "$0 --> Normalizing HYP text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${output_dir}/1best_recog/text.proc \
+ ${output_dir}/1best_recog/rec.txt
+ grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+ echo "$0 --> computing WER/CER and alignment ..."
+ ./utils/error_rate_zh \
+ --tokenizer char \
+ --ref ${output_dir}/1best_recog/ref.txt \
+ --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+ ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+ rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
new file mode 100644
index 000000000..2d311ddc6
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
@@ -0,0 +1,48 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
+
+from funasr.utils.compute_wer import compute_wer
+
+def modelscope_infer_after_finetune(params):
+ # prepare for decoding
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+        raise BaseException("Please download the pretrained model from ModelScope first.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
+ decoding_path = os.path.join(params["output_dir"], "decode_results")
+ if os.path.exists(decoding_path):
+ shutil.rmtree(decoding_path)
+ os.mkdir(decoding_path)
+
+ # decoding
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=pretrained_model_path,
+ output_dir=decoding_path,
+ batch_size=params["batch_size"]
+ )
+ audio_in = os.path.join(params["data_dir"], "wav.scp")
+ inference_pipeline(audio_in=audio_in)
+
+    # compute CER if GT text is set
+ text_in = os.path.join(params["data_dir"], "text")
+ if os.path.exists(text_in):
+ text_proc_file = os.path.join(decoding_path, "1best_recog/text")
+ compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
+
+
+if __name__ == '__main__':
+ params = {}
+ params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data/test"
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/utils b/egs_modelscope/asr/TEMPLATE/utils
new file mode 120000
index 000000000..dc7d4171f
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
index b3bfe8e24..8abadd719 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -7,7 +7,6 @@ from modelscope.utils.constant import Tasks
from funasr.utils.compute_wer import compute_wer
-import pdb;
def modelscope_infer_core(output_dir, split_dir, njob, idx):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
gpu_id = (int(idx) - 1) // njob
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 79cc3c3bf..c740f7187 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -23,21 +23,37 @@ Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.sh`
- model: # model name on ModelScope
- - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
+  - data_dir: # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` also exists, CER will be computed
- output_dir: # result dir
- batch_size: # batchsize of inference
- gpu_inference: # whether to perform gpu decoding, set false for cpu decoding
- gpuid_list: # set gpus, e.g., gpuid_list="0,1"
- njob: # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
-- Then you can run the pipeline to infer with:
-```python
- sh infer.sh
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
```
- Results
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
index 221479d99..b8b011c0a 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -14,8 +14,9 @@ gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+. utils/parse_options.sh || exit 1;
-if ${gpu_inference}; then
+if [ ${gpu_inference} == "true" ]; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index f3b4d560a..47226021f 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -346,6 +346,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 2b6716ed8..e10ebf404 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 6f3dbb113..e83286958 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -472,6 +472,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 8cbd41905..5546c92ae 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -612,7 +612,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
-
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
@@ -629,7 +631,9 @@ def inference_modelscope(
export_mode = param_dict.get("export_mode", False)
else:
hotword_list_or_file = None
-
+
+ if kwargs.get("device", None) == "cpu":
+ ngpu = 0
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 944685f1d..821f69429 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -536,6 +536,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad.py b/funasr/bin/asr_inference_paraformer_vad.py
index 1548f9ff1..977dc9bb3 100644
--- a/funasr/bin/asr_inference_paraformer_vad.py
+++ b/funasr/bin/asr_inference_paraformer_vad.py
@@ -157,6 +157,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 9dc0b79ce..197930f47 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -484,6 +484,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 4aea72079..35ecdc24b 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -379,6 +379,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 83436e8a7..07974c072 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 01d3f296a..87816dd22 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -158,6 +158,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/lm_inference.py b/funasr/bin/lm_inference.py
index 15c56caef..76de6df7a 100644
--- a/funasr/bin/lm_inference.py
+++ b/funasr/bin/lm_inference.py
@@ -89,10 +89,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/lm_inference_launch.py b/funasr/bin/lm_inference_launch.py
index d229cc6c1..dc6414f6a 100644
--- a/funasr/bin/lm_inference_launch.py
+++ b/funasr/bin/lm_inference_launch.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
+
import argparse
import logging
diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py
index 2c5a2865f..b1d923553 100755
--- a/funasr/bin/punc_inference_launch.py
+++ b/funasr/bin/punc_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punctuation_infer_vadrealtime.py
index 5157eeb29..b2db1bf17 100644
--- a/funasr/bin/punctuation_infer_vadrealtime.py
+++ b/funasr/bin/punctuation_infer_vadrealtime.py
@@ -203,10 +203,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index 5a0a8e28f..c55bc3544 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -252,6 +252,8 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index 7e63bbd2d..76b1dfbb8 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -179,6 +179,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference_launch.py b/funasr/bin/sv_inference_launch.py
index 64a3cff2e..880607013 100755
--- a/funasr/bin/sv_inference_launch.py
+++ b/funasr/bin/sv_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py
index 6360b17db..df029fdcc 100644
--- a/funasr/bin/tp_inference.py
+++ b/funasr/bin/tp_inference.py
@@ -54,7 +54,7 @@ class SpeechText2Timestamp:
assert check_argument_types()
# 1. Build ASR model
tp_model, tp_train_args = ASRTask.build_model_from_file(
- timestamp_infer_config, timestamp_model_file, device
+ timestamp_infer_config, timestamp_model_file, device=device
)
if 'cuda' in device:
tp_model = tp_model.cuda() # force model to cuda
@@ -179,6 +179,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py
index 55debac6d..6cdff057d 100644
--- a/funasr/bin/tp_inference_launch.py
+++ b/funasr/bin/tp_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 08d65a4e7..aff0a443b 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -192,6 +192,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 8fea8db16..4a1f334cf 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
index 9ed072199..4d026207d 100644
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -151,6 +151,9 @@ def inference_modelscope(
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/models/joint_net/__init__.py b/funasr/models/joint_net/__init__.py
new file mode 100644
index 000000000..8b1378917
--- /dev/null
+++ b/funasr/models/joint_net/__init__.py
@@ -0,0 +1 @@
+
diff --git a/funasr/runtime/python/benchmark_libtorch.md b/funasr/runtime/python/benchmark_libtorch.md
index 6c068fef3..52927b190 100644
--- a/funasr/runtime/python/benchmark_libtorch.md
+++ b/funasr/runtime/python/benchmark_libtorch.md
@@ -1,27 +1,32 @@
-# Benchmark
+# CPU Benchmark (Libtorch)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For users in China, you can install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
-
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
+set the model, data path and output_dir
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/benchmark_onnx.md b/funasr/runtime/python/benchmark_onnx.md
index 533798a8a..9f920942f 100644
--- a/funasr/runtime/python/benchmark_onnx.md
+++ b/funasr/runtime/python/benchmark_onnx.md
@@ -1,26 +1,32 @@
-# Benchmark
+# CPU Benchmark (ONNX)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For users in China, you can install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
+set the model, data path and output_dir
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index e85e08aac..3f4e76236 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -35,22 +35,114 @@ pip install -e ./
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```
-## Run the demo
-- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
+## Inference with runtime
+
+### Speech Recognition
+#### Paraformer
+ ```python
+ from funasr_onnx import Paraformer
+
+ model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ model = Paraformer(model_dir, batch_size=1)
+
+ wav_path = ['./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+ result = model(wav_path)
+ print(result)
+ ```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- Input: wav formt file, support formats: `str, np.ndarray, List[str]`
-- Output: `List[str]`: recognition result.
-- Example:
- ```python
- from funasr_onnx import Paraformer
+- Output: `List[str]`: recognition result
- model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- model = Paraformer(model_dir, batch_size=1)
+#### Paraformer-online
- wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+### Voice Activity Detection
+#### FSMN-VAD
+```python
+from funasr_onnx import Fsmn_vad
- result = model(wav_path)
- print(result)
- ```
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad(model_dir)
+
+result = model(wav_path)
+print(result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, supported formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: recognition result
+
+#### FSMN-VAD-online
+```python
+from funasr_onnx import Fsmn_vad_online
+import soundfile
+
+
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad_online(model_dir)
+
+
+##online vad
+speech, sample_rate = soundfile.read(wav_path)
+speech_length = speech.shape[0]
+#
+sample_offset = 0
+step = 1600
+param_dict = {'in_cache': []}
+for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+ if sample_offset + step >= speech_length - 1:
+ step = speech_length - sample_offset
+ is_final = True
+ else:
+ is_final = False
+ param_dict['is_final'] = is_final
+ segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
+ param_dict=param_dict)
+ if segments_result:
+ print(segments_result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, supported formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: recognition result
+
+### Punctuation Restoration
+#### CT-Transformer
+```python
+from funasr_onnx import CT_Transformer
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model = CT_Transformer(model_dir)
+
+text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
+result = model(text_in)
+print(result[0])
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, supported formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: recognition result
+
+#### CT-Transformer-online
+```python
+from funasr_onnx import CT_Transformer_VadRealtime
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
+model = CT_Transformer_VadRealtime(model_dir)
+
+text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流>问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+
+vads = text_in.split("|")
+rec_result_all=""
+param_dict = {"cache": []}
+for vad in vads:
+ result = model(vad, param_dict=param_dict)
+ rec_result_all += result[0]
+
+print(rec_result_all)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, supported formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: recognition result
## Performance benchmark
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 9574a0dad..9c4af414f 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -186,9 +186,6 @@ class Trainer:
logging.warning("No keep_nbest_models is given. Change to [1]")
trainer_options.keep_nbest_models = [1]
keep_nbest_models = trainer_options.keep_nbest_models
-
- #assert batch_interval is set and >0
- assert trainer_options.batch_interval > 0
output_dir = Path(trainer_options.output_dir)
reporter = Reporter()