From 3d17f80b997a057dc14237419b27a5f61c583f15 Mon Sep 17 00:00:00 2001 From: "haoneng.lhn" Date: Wed, 11 Oct 2023 16:24:22 +0800 Subject: [PATCH 1/3] update github io --- .github/workflows/main.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 9f3d9bede..eec8f3dce 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -23,7 +23,7 @@ jobs: pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme myst-parser" - name: deploy copy - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh' + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh' || github.ref == 'refs/heads/dev_lhn' run: | mkdir public touch public/.nojekyll @@ -35,7 +35,7 @@ jobs: cp -r docs/m2met2/_build/html/* public/m2met2/ - name: deploy github.io pages - if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh' + if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh' || github.ref == 'refs/heads/dev_lhn' uses: peaceiris/actions-gh-pages@v2.3.1 env: GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }} From 191b8018f2e1a99e58b999a0ecfbc3f72526e669 Mon Sep 17 00:00:00 2001 From: "haoneng.lhn" Date: Wed, 11 Oct 2023 17:19:38 +0800 Subject: [PATCH 2/3] update --- funasr/datasets/large_datasets/dataset.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py index 6c166a505..adfe4f6d9 100644 --- a/funasr/datasets/large_datasets/dataset.py +++ b/funasr/datasets/large_datasets/dataset.py @@ -108,7 +108,7 @@ class AudioDataset(IterableDataset): ark_reader = ReadHelper('ark:{}'.format(data_file)) reader_list.append(ark_reader) elif data_type == "text" 
or data_type == "sound" or data_type == 'text_hotword': - text_reader = open(data_file, "r") + text_reader = open(data_file, "r", encoding="utf-8") reader_list.append(text_reader) elif data_type == "none": continue @@ -205,7 +205,7 @@ def Dataset(data_list_file, # pre_prob = conf.get("pre_prob", 0) # unused yet if pre_hwfile is not None: pre_hwlist = [] - with open(pre_hwfile, 'r') as fin: + with open(pre_hwfile, 'r', encoding="utf-8") as fin: for line in fin.readlines(): pre_hwlist.append(line.strip()) else: From 4f32163c0d7f4b43626cb21a76328dafb59cc2e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8C=97=E5=BF=B5?= Date: Wed, 11 Oct 2023 19:07:53 +0800 Subject: [PATCH 3/3] add paraformer-en-long inference demo --- .../README.md | 1 + .../README_zh.md | 1 + .../demo.py | 18 +++ .../infer.py | 27 +++++ .../infer.sh | 103 ++++++++++++++++++ .../utils | 1 + 6 files changed, 151 insertions(+) create mode 120000 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README.md create mode 120000 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README_zh.md create mode 100644 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/demo.py create mode 100644 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py create mode 100644 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh create mode 120000 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/utils diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README.md new file mode 120000 index 000000000..92088a21d --- /dev/null +++ 
b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README_zh.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README_zh.md
new file mode 120000
index 000000000..b88b7fb57
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/README_zh.md
@@ -0,0 +1 @@
+../TEMPLATE/README_zh.md
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/demo.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/demo.py
new file mode 100644
index 000000000..671e5c7ea
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/demo.py
@@ -0,0 +1,18 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':  # demo: build the ASR pipeline, decode one remote WAV, print the result
+    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav'
+    output_dir = "./results"  # handed to pipeline() below as output_dir
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model='damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020',
+        model_revision='v1.0.0',
+        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',  # NOTE(review): zh-cn VAD checkpoint in an English recipe; confirm it is intended
+        punc_model='damo/punc_ct-transformer_cn-en-common-vocab471067-large',
+        punc_model_revision='v1.0.0',
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipeline(audio_in=audio_in, batch_size_token=5000, batch_size_token_threshold_s=40, max_single_segment_time=6000)  # NOTE(review): units of these batching/segment limits are not visible here; confirm
+    print(rec_result)
+
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
new file mode 100644
index 000000000..f54399a14
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
@@ -0,0 +1,41 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    """Run one ModelScope ASR decoding job described by ``args``.
+
+    The pipeline's return value is discarded; infer.sh collects the results
+    from ``1best_recog/`` under ``args.output_dir``.
+
+    Args:
+        args: Parsed command-line namespace providing ``gpuid``, ``model``,
+            ``output_dir``, ``decoding_mode``, ``hotword_txt``, ``audio_in`` and
+            ``batch_size_token``.
+    """
+    # Restrict the visible CUDA devices before the pipeline is created.
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model=args.model,
+        output_dir=args.output_dir,
+        # NOTE(review): the key is spelled "decoding_model" while the CLI flag is
+        # --decoding_mode; kept as-is, confirm which key the pipeline reads.
+        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
+    )
+    inference_pipeline(audio_in=args.audio_in, batch_size_token=args.batch_size_token)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # NOTE(review): default is a zh-cn checkpoint although this recipe directory is for the English model; confirm.
+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--decoding_mode', type=str, default="normal")
+    parser.add_argument('--hotword_txt', type=str, default=None)
+    parser.add_argument('--batch_size_token', type=int, default=5000)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh
new file mode 100644
index 000000000..ef49d7a60
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh
@@ -0,0 +1,110 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+# NOTE(review): default is a zh-cn checkpoint although this recipe directory is for the English model; confirm.
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+# NOTE(review): forwarded to infer.py as --batch_size_token (default 5000 there); confirm 64 is the intended value.
+batch_size=64
+gpu_inference=true # whether to perform gpu decoding
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+njob=64 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+# Compare the flag as a string instead of executing its value as a command.
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+# Split wav.scp into one shard per job.
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+    model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        # Spell the option out in full rather than relying on argparse prefix matching.
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size_token ${batch_size} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    # Merge the per-job outputs into one file per kind, sorted on the first field.
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+# NOTE(review): stage 3 calls the *_zh text-normalization/scoring tools; confirm they suit this English recipe.
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/utils b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/utils
new file mode 120000
index 000000000..3d3dd06b0
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/utils
@@ -0,0 +1 @@
+../../asr/TEMPLATE/utils
\ No newline at end of file