mirror of https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00

docs

This commit is contained in:
parent 56508c42af
commit ad128cbe0c
@ -1,30 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained Paraformer-large Model

### Finetune

- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate

- Then you can run the pipeline to finetune with:
```python
python finetune.py
```

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.

- Then you can run the pipeline to infer with:
```python
python infer.py
```
@ -0,0 +1 @@
../../TEMPLATE/README.md
@ -0,0 +1,14 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -1,14 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -0,0 +1 @@
../../TEMPLATE/infer.py
@ -0,0 +1 @@
../../TEMPLATE/infer.sh
@ -0,0 +1 @@
../../TEMPLATE/README.md
@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -1,13 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -0,0 +1 @@
../../TEMPLATE/infer.py
@ -0,0 +1 @@
../../TEMPLATE/infer.sh
@ -0,0 +1,11 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
    model_revision='v3.0.0'
)

rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)
@ -1,67 +0,0 @@
import json
import os
import shutil

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        output_dir=decoding_path,
        batch_size=1
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # computer CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if text_in is not None:
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        text_proc_file2 = os.path.join(decoding_path, "1best_recog/token_nosep")
        with open(text_proc_file, 'r') as hyp_reader:
            with open(text_proc_file2, 'w') as hyp_writer:
                for line in hyp_reader:
                    new_context = line.strip().replace("src","").replace(" "," ").replace(" "," ").strip()
                    hyp_writer.write(new_context+'\n')
        text_in2 = os.path.join(decoding_path, "1best_recog/ref_text_nosep")
        with open(text_in, 'r') as ref_reader:
            with open(text_in2, 'w') as ref_writer:
                for line in ref_reader:
                    new_context = line.strip().replace("src","").replace(" "," ").replace(" "," ").strip()
                    ref_writer.write(new_context+'\n')

        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.sp.cer"))
        compute_wer(text_in2, text_proc_file2, os.path.join(decoding_path, "text.nosp.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950"
    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./example_data/validation"
    params["decoding_model_name"] = "valid.acc.ave.pb"
    modelscope_infer_after_finetune(params)
@ -1,19 +0,0 @@
# ModelScope Model

## How to infer using a pretrained Paraformer-large Model

### Inference

You can use the pretrain model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # Support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
- <strong>batch_size:</strong> # Set batch size in inference.
- <strong>param_dict:</strong> # Set the hotword list in inference.

- Then you can run the pipeline to infer with:
```python
python infer.py
```
@ -0,0 +1 @@
../TEMPLATE/README.md
@ -1,105 +0,0 @@
#!/usr/bin/env bash

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # whether to perform gpu decoding
gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
njob=64               # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"
hotword_txt=None

. utils/parse_options.sh || exit 1;

if ${gpu_inference} == "true"; then
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
    model=${checkpoint_dir}/${model}
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for JOB in $(seq ${nj}); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p ${output_dir}/output.$JOB
        python infer.py \
            --model ${model} \
            --audio_in ${output_dir}/split/wav.$JOB.scp \
            --output_dir ${output_dir}/output.$JOB \
            --batch_size ${batch_size} \
            --gpuid ${gpuid} \
            --hotword_txt ${hotword_txt}
        }&
    done
    wait

    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
@ -0,0 +1 @@
../TEMPLATE/infer.sh
@ -1,36 +0,0 @@
import os
import tempfile
import codecs
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.msdatasets import MsDataset

if __name__ == '__main__':
    param_dict = dict()
    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"

    output_dir = "./output"
    batch_size = 1

    # dataset split ['test']
    ds_dict = MsDataset.load(dataset_name='speech_asr_aishell1_hotwords_testsets', namespace='speech_asr')
    work_dir = tempfile.TemporaryDirectory().name
    if not os.path.exists(work_dir):
        os.makedirs(work_dir)
    wav_file_path = os.path.join(work_dir, "wav.scp")

    with codecs.open(wav_file_path, 'w') as fin:
        for line in ds_dict:
            wav = line["Audio:FILE"]
            idx = wav.split("/")[-1].split(".")[0]
            fin.writelines(idx + " " + wav + "\n")
    audio_in = wav_file_path

    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
        output_dir=output_dir,
        batch_size=batch_size,
        param_dict=param_dict)

    rec_result = inference_pipeline(audio_in=audio_in)
@ -1,76 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained Paraformer-large Model

### Finetune

- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
- <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
- <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate

- Then you can run the pipeline to finetune with:
```python
python finetune.py
```

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.sh`
- <strong>model:</strong> # model name on ModelScope
- <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
- <strong>output_dir:</strong> # result dir
- <strong>batch_size:</strong> # batchsize of inference
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`

- Decode with multi GPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 64 \
    --gpu_inference true \
    --gpuid_list "0,1"
```

- Decode with multi-thread CPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 64
```

- Results

The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.

If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.

### Inference using local finetuned model

- Modify inference related parameters in `infer_after_finetune.py`
- <strong>modelscope_model_name:</strong> # model name on ModelScope
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- <strong>batch_size:</strong> # batchsize of inference

- Then you can run the pipeline to finetune with:
```python
python infer_after_finetune.py
```

- Results

The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
@ -0,0 +1 @@
../TEMPLATE/README.md
@ -1,103 +0,0 @@
#!/usr/bin/env bash

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # whether to perform gpu decoding
gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
njob=64               # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

if ${gpu_inference} == "true"; then
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
    model=${checkpoint_dir}/${model}
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for JOB in $(seq ${nj}); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p ${output_dir}/output.$JOB
        python infer.py \
            --model ${model} \
            --audio_in ${output_dir}/split/wav.$JOB.scp \
            --output_dir ${output_dir}/output.$JOB \
            --batch_size ${batch_size} \
            --gpuid ${gpuid}
        }&
    done
    wait

    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
@ -0,0 +1 @@
../TEMPLATE/infer.sh
@ -1,48 +0,0 @@
import json
import os
import shutil

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # computer CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)
@ -1,30 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained Paraformer-large Model

### Finetune

- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate

- Then you can run the pipeline to finetune with:
```python
python finetune.py
```

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.

- Then you can run the pipeline to infer with:
```python
python infer.py
```
@ -0,0 +1 @@
../TEMPLATE/README.md
@ -0,0 +1,15 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
        output_dir=output_dir,
        batch_size=1,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -1,15 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
        output_dir=output_dir,
        batch_size=32,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -0,0 +1 @@
../TEMPLATE/infer.py
@ -0,0 +1 @@
../TEMPLATE/infer.sh
@ -0,0 +1 @@
../TEMPLATE/README.md
@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -1,13 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in)
    print(rec_result)
@ -0,0 +1 @@
../TEMPLATE/infer.py
@ -0,0 +1 @@
../TEMPLATE/infer.sh
@ -1,46 +1,246 @@
# ModelScope Model
# Speech Recognition

## How to finetune and infer using a pretrained Paraformer-large Model
> **Note**:
> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take typical models as examples to demonstrate the usage.

### Finetune
## Inference

- Modify finetune training related parameters in `finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- <strong>batch_bins:</strong> # batch size
- <strong>max_epoch:</strong> # number of training epoch
- <strong>lr:</strong> # learning rate

- Then you can run the pipeline to finetune with:
### Quick start
#### [Paraformer Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
```python
python finetune.py
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
)

rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)
```
#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
```python
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
)
import soundfile
speech, sample_rate = soundfile.read("example/asr_example.wav")

param_dict = {"cache": dict(), "is_final": False}
chunk_stride = 7680  # 480ms
# first chunk, 480ms
speech_chunk = speech[0:chunk_stride]
rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
print(rec_result)
# next chunk, 480ms
speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
print(rec_result)
```
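The two calls above decode only the first 960 ms. Presumably the whole utterance can be streamed chunk by chunk in the same way, with `is_final` switched to `True` on the last chunk so the model flushes its remaining output. A minimal sketch, reusing the names from the example above and starting from a fresh cache:

```python
# Hedged sketch: stream the whole waveform in 480 ms chunks with a fresh cache,
# marking the last chunk with is_final=True (assumed to flush the model state).
import math

param_dict = {"cache": dict(), "is_final": False}
num_chunks = math.ceil(len(speech) / chunk_stride)
for i in range(num_chunks):
    chunk = speech[i * chunk_stride:(i + 1) * chunk_stride]
    param_dict["is_final"] = (i == num_chunks - 1)
    rec_result = inference_pipeline(audio_in=chunk, param_dict=param_dict)
    print(rec_result)
```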
For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)

#### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
```python
decoding_model = "fast"  # "fast", "normal", "offline"
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
    param_dict={"decoding_model": decoding_model})

rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)
```
The `fast` and `normal` decoding modes are simulated (fake) streaming, and can be used to evaluate recognition accuracy.
For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
#### [RNN-T-online model]()
To be done.

#### [MFCCA Model](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
For more model details, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
    model_revision='v3.0.0'
)

rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
print(rec_result)
```

### Inference
#### API-reference
##### Define pipeline
- `task`: `Tasks.auto_speech_recognition`
- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
- `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU
- `output_dir`: `None` (Default), the output path of results if set
- `batch_size`: `1` (Default), batch size when decoding
##### Infer pipeline
- `audio_in`: the input to decode, which could be:
    - wav_path, `e.g.`: asr_example.wav,
    - pcm_path, `e.g.`: asr_example.pcm,
    - audio bytes stream, `e.g.`: bytes data from a microphone
    - audio sample point, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
    - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
    ```text
    asr_example1 ./audios/asr_example1.wav
    asr_example2 ./audios/asr_example2.wav
    ```
    In this case of `wav.scp` input, `output_dir` must be set to save the output results.
- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
- `output_dir`: None (Default), the output path of results if set
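Putting the parameters above together, a minimal sketch of decoding a `wav.scp` list on CPU could look like the following (paths and values are placeholders):

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Hedged sketch of the API parameters documented above; paths are placeholders.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
    ngpu=0,                  # decode on CPU instead of GPU
    ncpu=4,                  # intraop threads for CPU decoding
    output_dir='./results',  # required because the input below is a wav.scp list
    batch_size=16,
)
inference_pipeline(audio_in='./data/test/wav.scp')
```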

Or you can use the finetuned model for inference directly.

### Inference with multi-thread CPUs or multi GPUs
FunASR also offers recipes [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
- Setting parameters in `infer.sh`
    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
    - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` also exists, CER will be computed
    - `output_dir`: output dir of the recognition results
    - `batch_size`: `64` (Default), batch size of inference on gpu
    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
    - `checkpoint_dir`: only used for inferring finetuned models, the path dir of finetuned models
    - `checkpoint_name`: only used for inferring finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
    - `decoding_mode`: `normal` (Default), decoding mode for the UniASR model (`fast`, `normal`, `offline`)
    - `hotword_txt`: `None` (Default), hotword file for the contextual Paraformer model (the hotword file name ends with .txt)

- Then you can run the pipeline to infer with:
```python
python infer.py
```
- Decode with multi GPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 64 \
    --gpu_inference true \
    --gpuid_list "0,1"
```

### Inference using local finetuned model

- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to infer with:
```python
python infer_after_finetune.py
```
- Decode with multi-thread CPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 64
```

- Results

The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.

If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.


## Finetune with pipeline

### Quick start
[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
```python
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from modelscope.msdatasets.audio.asr_dataset import ASRDataset

def modelscope_finetune(params):
    if not os.path.exists(params.output_dir):
        os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    from funasr.utils.modelscope_param import modelscope_args
    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
    params.output_dir = "./checkpoint"                  # model save path
    params.data_path = "speech_asr_aishell1_trainsets"  # data path: a dataset already uploaded to ModelScope, or local data
    params.dataset_type = "small"                       # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
    params.batch_bins = 2000                            # batch size: with dataset_type="small", batch_bins counts fbank feature frames; with dataset_type="large", it is the duration in ms
    params.max_epoch = 50                               # maximum number of training epochs
    params.lr = 0.00005                                 # learning rate

    modelscope_finetune(params)
```

```shell
python finetune.py &> log.txt &
```

### Finetune with your data

- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
    - `output_dir`: result dir
    - `data_dir`: the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
    - `dataset_type`: for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
    - `batch_bins`: batch size. When dataset_type is `small`, `batch_bins` indicates the number of feature frames. When dataset_type is `large`, `batch_bins` indicates the duration in ms
    - `max_epoch`: number of training epochs
    - `lr`: learning rate

- Training data formats:
```sh
cat ./example_data/text
BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
english_example_1 hello world
english_example_2 go swim 去 游 泳

cat ./example_data/wav.scp
BAC009S0002W0122 /mnt/data/wav/train/S0002/BAC009S0002W0122.wav
BAC009S0002W0123 /mnt/data/wav/train/S0002/BAC009S0002W0123.wav
english_example_1 /mnt/data/wav/train/S0002/english_example_1.wav
english_example_2 /mnt/data/wav/train/S0002/english_example_2.wav
```

- Then you can run the pipeline to finetune with:
```shell
python finetune.py
```
If you want to finetune with multiple GPUs, you could:
```shell
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
```
## Inference with your finetuned model

- Setting parameters in [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) is the same as in the [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/egs_modelscope/asr/TEMPLATE#inference-with-multi-thread-cpus-or-multi-gpus); `model` is the ModelScope model name which you finetuned.

- Decode with multi GPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 64 \
    --gpu_inference true \
    --gpuid_list "0,1" \
    --checkpoint_dir "./checkpoint" \
    --checkpoint_name "valid.cer_ctc.ave.pb"
```
- Decode with multi-thread CPUs:
```shell
bash infer.sh \
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 64 \
    --checkpoint_dir "./checkpoint" \
    --checkpoint_name "valid.cer_ctc.ave.pb"
```
@ -0,0 +1,15 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    )
    rec_result = inference_pipeline(audio_in=audio_in)
    print(rec_result)
@ -1,16 +1,28 @@
import os
import shutil
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
    output_dir = None
def modelscope_infer(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
        ngpu=1,
        model=args.model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
    )
    rec_result = inference_pipeline(audio_in=audio_in)
    print(rec_result)
    inference_pipeline(audio_in=args.audio_in)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--decoding_mode', type=str, default="normal")
    parser.add_argument('--hotword_txt', type=str, default=None)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")
    args = parser.parse_args()
    modelscope_infer(args)
@ -0,0 +1,103 @@
#!/usr/bin/env bash

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true    # whether to perform gpu decoding
gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
njob=64               # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

if ${gpu_inference} == "true"; then
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
    model=${checkpoint_dir}/${model}
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for JOB in $(seq ${nj}); do
        {
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p ${output_dir}/output.$JOB
        python infer.py \
            --model ${model} \
            --audio_in ${output_dir}/split/wav.$JOB.scp \
            --output_dir ${output_dir}/output.$JOB \
            --batch_size ${batch_size} \
            --gpuid ${gpuid}
        }&
    done
    wait

    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
@ -1,47 +0,0 @@
import json
import os
import shutil

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # computer CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)
@ -0,0 +1 @@
../../asr/TEMPLATE/utils
@ -1,5 +1,4 @@
# Punctuation Restoration
# Voice Activity Detection

> **Note**:
> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the CT-Transformer punctuation model as an example to demonstrate the usage.
@ -69,7 +68,7 @@ Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/
- `param_dict`: holds the cache, which is required in realtime mode.

### Inference with multi-thread CPUs or multi GPUs
FunASR also offers recipes [egs_modelscope/punc/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punc/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs. It is an offline recipe and only supports offline models.
FunASR also offers recipes [egs_modelscope/punctuation/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punctuation/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs. It is an offline recipe and only supports offline models.

- Setting parameters in `infer.sh`
- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
@ -87,7 +86,7 @@ FunASR also offer recipes [egs_modelscope/punc/TEMPLATE/infer.sh](https://github
    --model "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 64 \
    --batch_size 1 \
    --gpu_inference true \
    --gpuid_list "0,1"
```
@ -98,7 +97,7 @@ FunASR also offer recipes [egs_modelscope/punc/TEMPLATE/infer.sh](https://github
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 64
    --njob 1
```
@ -0,0 +1 @@
../../TEMPLATE/README.md
@ -0,0 +1,27 @@

################## raw text input #####################
inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)


inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model='damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727',
    output_dir="./tmp/"
)

vads = inputs.split("|")
rec_result_all = "outputs:"
param_dict = {"cache": []}
for vad in vads:
    rec_result = inference_pipeline(text_in=vad, param_dict=param_dict)
    rec_result_all += rec_result['text']

print(rec_result_all)
@ -1,27 +0,0 @@

################## raw text input #####################
inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)


inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model='damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727',
    output_dir="./tmp/"
)

vads = inputs.split("|")
rec_result_all = "outputs:"
param_dict = {"cache": []}
for vad in vads:
    rec_result = inference_pipeline(text_in=vad, param_dict=param_dict)
    rec_result_all += rec_result['text']

print(rec_result_all)
@ -0,0 +1 @@
../../TEMPLATE/infer.py
@ -0,0 +1 @@
../../TEMPLATE/infer.sh
@ -1,19 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Or you can use the finetuned model for inference directly.

task=Tasks.punctuation,
model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',

- Setting parameters in `modelscope_common_infer.sh`
- <strong>model:</strong> damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch # pre-trained model, download from modelscope
- <strong>text_in:</strong> input path, text or url
- <strong>output_dir:</strong> the result dir
- Then you can run the pipeline to infer with:
```sh
python ./infer.py
```
@ -0,0 +1 @@
../TEMPLATE/README.md
@ -1,3 +0,0 @@
1 跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益
2 从存储上来说仅仅是全景图片它就会是图片的四倍的容量然后全景的视频会是普通视频八倍的这个存储的容要求而三d的模型会是图片的十倍这都对我们今天运行在的云计算的平台存储的平台提出了更高的要求
3 那今天的会就到这里吧 happy new year 明年见
@ -0,0 +1,23 @@

################## path to a text.scp file ###################
inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"

################## raw text input #####################
#inputs = "我们都是木头人不会讲话不会动"

################## url of a text file #######################
#inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"


from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipline = pipeline(
    task=Tasks.punctuation,
    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    model_revision="v1.1.7",
    output_dir="./tmp/"
)

rec_result = inference_pipline(text_in=inputs)
print(rec_result)
@ -1,23 +0,0 @@

################## path to a text.scp file ###################
inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"

################## raw text input #####################
#inputs = "我们都是木头人不会讲话不会动"

################## url of a text file #######################
#inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"


from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipline = pipeline(
    task=Tasks.punctuation,
    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
    model_revision="v1.1.7",
    output_dir="./tmp/"
)

rec_result = inference_pipline(text_in=inputs)
print(rec_result)
@ -0,0 +1 @@
../TEMPLATE/infer.py
@ -0,0 +1 @@
../TEMPLATE/infer.sh
@ -59,11 +59,11 @@ Timestamp pipeline can also be used after ASR pipeline to compose complete ASR f
```
|
||||
|
||||
### Inference with multi-thread CPUs or multi GPUs
|
||||
FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
|
||||
FunASR also offer recipes [egs_modelscope/tp/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/tp/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
|
||||
|
||||
- Setting parameters in `infer.sh`
|
||||
- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
|
||||
- `data_dir`: the dataset dir **must** include `wav.scp` and `text.scp`
|
||||
- `data_dir`: the dataset dir **must** include `wav.scp` and `text.txt`
|
||||
- `output_dir`: output dir of the recognition results
|
||||
- `batch_size`: `64` (Default), batch size of inference on gpu
|
||||
- `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
|
||||
@ -78,7 +78,7 @@ FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.
|
||||
--model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
|
||||
--data_dir "./data/test" \
|
||||
--output_dir "./results" \
|
||||
--batch_size 64 \
|
||||
--batch_size 1 \
|
||||
--gpu_inference true \
|
||||
--gpuid_list "0,1"
|
||||
```
|
||||
@ -89,7 +89,7 @@ FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.
|
||||
--data_dir "./data/test" \
|
||||
--output_dir "./results" \
|
||||
--gpu_inference false \
|
||||
--njob 64
|
||||
--njob 1
|
||||
```
|
||||
|
||||
## Finetune with pipeline
|
||||
|
||||
@ -1 +0,0 @@
../speech_timestamp_prediction-v1-16k-offline/infer.py

egs_modelscope/tp/TEMPLATE/infer.py (new file)
@ -0,0 +1,28 @@
import os
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

def modelscope_infer(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    inference_pipeline = pipeline(
        task=Tasks.speech_timestamp,
        model=args.model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
    )
    if args.output_dir is not None:
        inference_pipeline(audio_in=args.audio_in, text_in=args.text_in)
    else:
        print(inference_pipeline(audio_in=args.audio_in, text_in=args.text_in))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_timestamp_prediction-v1-16k-offline")
    parser.add_argument('--audio_in', type=str, default="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav")
    parser.add_argument('--text_in', type=str, default="一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--gpuid', type=str, default="0")
    args = parser.parse_args()
    modelscope_infer(args)
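To exercise this script without the command line, one hedged option is to build an `argparse.Namespace` that mirrors the defaults above and hand it to `modelscope_infer` directly:

```python
# Hedged sketch: drive modelscope_infer() with a Namespace matching the CLI
# defaults above; run it inside infer.py itself or after importing the function
# (e.g. `from infer import modelscope_infer`, an assumed module name).
import argparse

args = argparse.Namespace(
    model="damo/speech_timestamp_prediction-v1-16k-offline",
    audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav",
    text_in="一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢",
    output_dir="./results/",
    batch_size=1,
    gpuid="0",
)
modelscope_infer(args)
```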
@ -37,7 +37,7 @@ for JOB in $(seq ${nj}); do
    split_texts="$split_texts $output_dir/split/text.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
perl utils/split_scp.pl ${data_dir}/text.scp ${split_texts}
perl utils/split_scp.pl ${data_dir}/text.txt ${split_texts}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
@ -1,25 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>text_in:</strong> # support text, text url.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.

- Then you can run the pipeline to infer with:
```python
python infer.py
```


Modify inference-related parameters in `vad.yaml`.

- max_end_silence_time: the trailing-silence duration used to decide that a sentence has ended; the valid range is 500 ms~6000 ms and the default value is 800 ms
- speech_noise_thres: the threshold balancing speech and silence scores; the valid range is (-1, 1)
  - the closer the value is to -1, the more likely noise is judged as speech
  - the closer the value is to 1, the more likely speech is judged as noise
@ -0,0 +1 @@
../../TEMPLATE/README.md
@ -0,0 +1,12 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipline = pipeline(
    task=Tasks.speech_timestamp,
    model='damo/speech_timestamp_prediction-v1-16k-offline',
    output_dir=None)

rec_result = inference_pipline(
    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
    text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',)
print(rec_result)
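The hunk context above mentions that the timestamp pipeline can be used after the ASR pipeline to compose a complete ASR function. A hedged sketch of that composition follows; the Paraformer model name and the `'text'` result key are taken from other files in this commit, and the per-character spacing copies the demo's `text_in` format, so treat the details as assumptions rather than a fixed API.

```python
# Hedged sketch: ASR first, then timestamp prediction on the same audio.
# Model names and the 'text' result key are assumptions taken from this commit;
# the timestamp pipeline is fed space-separated characters, as in the demo above.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav'

asr_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
)
text = asr_pipeline(audio_in=audio_in)['text']

tp_pipeline = pipeline(
    task=Tasks.speech_timestamp,
    model='damo/speech_timestamp_prediction-v1-16k-offline',
)
# Space-separate the characters to match the text_in format used in the demo.
print(tp_pipeline(audio_in=audio_in, text_in=" ".join(text.replace(" ", ""))))
```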
@ -1,28 +0,0 @@
import os
import argparse
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

def modelscope_infer(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    inference_pipeline = pipeline(
        task=Tasks.speech_timestamp,
        model=args.model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
    )
    if args.output_dir is not None:
        inference_pipeline(audio_in=args.audio_in, text_in=args.text_in)
    else:
        print(inference_pipeline(audio_in=args.audio_in, text_in=args.text_in))

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_timestamp_prediction-v1-16k-offline")
    parser.add_argument('--audio_in', type=str, default="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav")
    parser.add_argument('--text_in', type=str, default="一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--gpuid', type=str, default="0")
    args = parser.parse_args()
    modelscope_infer(args)
@ -0,0 +1 @@
../../TEMPLATE/infer.py
@ -0,0 +1 @@
../../TEMPLATE/infer.sh
@ -86,7 +86,7 @@ FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.
    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
    --data_dir "./data/test" \
    --output_dir "./results" \
    --batch_size 64 \
    --batch_size 1 \
    --gpu_inference true \
    --gpuid_list "0,1"
```
@ -97,7 +97,7 @@ FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.
    --data_dir "./data/test" \
    --output_dir "./results" \
    --gpu_inference false \
    --njob 64
    --njob 1
```

## Finetune with pipeline
@ -1,24 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.

- Then you can run the pipeline to infer with:
```python
python infer.py
```


Modify inference-related parameters in `vad.yaml`.

- max_end_silence_time: the trailing-silence duration used to decide that a sentence has ended; the valid range is 500 ms~6000 ms and the default value is 800 ms
- speech_noise_thres: the threshold balancing speech and silence scores; the valid range is (-1, 1)
  - the closer the value is to -1, the more likely noise is judged as speech
  - the closer the value is to 1, the more likely speech is judged as noise
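Since the old README points at `vad.yaml` for these two thresholds, a hedged sketch of editing them before the pipeline is built; the local file location and the key layout inside `vad.yaml` are assumptions and should be checked against the downloaded model directory:

```python
# Hedged sketch: adjust the VAD thresholds documented above by editing the
# model's vad.yaml. The path and the "vad_post_conf" key layout are assumptions.
import yaml  # PyYAML

cfg_path = "./speech_fsmn_vad_zh-cn-16k-common-pytorch/vad.yaml"  # hypothetical local copy
with open(cfg_path) as f:
    cfg = yaml.safe_load(f)

cfg["vad_post_conf"]["max_end_silence_time"] = 500  # 500 ms~6000 ms, default 800 ms
cfg["vad_post_conf"]["speech_noise_thres"] = 0.8    # range (-1, 1)

with open(cfg_path, "w") as f:
    yaml.safe_dump(cfg, f, allow_unicode=True)
```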
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/README.md

egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py (new file)
@ -0,0 +1,15 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        model_revision='v1.2.0',
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipline(audio_in=audio_in)
    print(segments_result)
@ -1,15 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
        model_revision='v1.2.0',
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipline(audio_in=audio_in)
    print(segments_result)
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/infer.py

egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/infer.sh
@ -1,24 +0,0 @@
# ModelScope Model

## How to finetune and infer using a pretrained ModelScope Model

### Inference

Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
- <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
- <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.

- Then you can run the pipeline to infer with:
```python
python infer.py
```


Modify inference-related parameters in `vad.yaml`.

- max_end_silence_time: the trailing-silence duration used to decide that a sentence has ended; the valid range is 500 ms~6000 ms and the default value is 800 ms
- speech_noise_thres: the threshold balancing speech and silence scores; the valid range is (-1, 1)
  - the closer the value is to -1, the more likely noise is judged as speech
  - the closer the value is to 1, the more likely speech is judged as noise
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/README.md

egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py (new file)
@ -0,0 +1,15 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-8k-common",
        model_revision='v1.2.0',
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipline(audio_in=audio_in)
    print(segments_result)
@ -1,15 +0,0 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == '__main__':
    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
    output_dir = None
    inference_pipline = pipeline(
        task=Tasks.voice_activity_detection,
        model="damo/speech_fsmn_vad_zh-cn-8k-common",
        model_revision='v1.2.0',
        output_dir=output_dir,
        batch_size=1,
    )
    segments_result = inference_pipline(audio_in=audio_in)
    print(segments_result)
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/infer.py

egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh (symbolic link)
@ -0,0 +1 @@
../../TEMPLATE/infer.sh