diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
deleted file mode 100644
index ca8a5ee10..000000000
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ /dev/null
@@ -1,111 +0,0 @@
-# Voice Activity Detection
-
-> **Note**:
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take model of FSMN-VAD as example to demonstrate the usage.
-
-## Inference
-
-### Quick start
-#### [FSMN-VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
-```python
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-inference_pipeline = pipeline(
-    task=Tasks.voice_activity_detection,
-    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-)
-
-segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
-print(segments_result)
-```
-#### [FSMN-VAD-online model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
-```python
-inference_pipeline = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-    )
-import soundfile
-speech, sample_rate = soundfile.read("example/asr_example.wav")
-
-param_dict = {"in_cache": dict(), "is_final": False}
-chunk_stride = 1600# 100ms
-# first chunk, 100ms
-speech_chunk = speech[0:chunk_stride]
-rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
-print(rec_result)
-# next chunk, 480ms
-speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
-rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
-print(rec_result)
-```
-Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
-
-
-
-#### API-reference
-##### Define pipeline
-- `task`: `Tasks.auto_speech_recognition`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-- `ngpu`: `1` (Defalut), decoding on GPU. If ngpu=0, decoding on CPU
-- `ncpu`: `1` (Defalut), sets the number of threads used for intraop parallelism on CPU
-- `output_dir`: `None` (Defalut), the output path of results if set
-- `batch_size`: `1` (Defalut), batch size when decoding
-##### Infer pipeline
-- `audio_in`: the input to decode, which could be:
-  - wav_path, `e.g.`: asr_example.wav,
-  - pcm_path, `e.g.`: asr_example.pcm,
-  - audio bytes stream, `e.g.`: bytes data from a microphone
-  - audio sample point,`e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
-  - wav.scp, kaldi style wav list (`wav_id \t wav_path``), `e.g.`:
-  ```text
-  asr_example1  ./audios/asr_example1.wav
-  asr_example2  ./audios/asr_example2.wav
-  ```
-  In this case of `wav.scp` input, `output_dir` must be set to save the output results
-- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
-- `output_dir`: None (Defalut), the output path of results if set
-
-### Inference with multi-thread CPUs or multi GPUs
-FunASR also offer recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
-
-- Setting parameters in `infer.sh`
-  - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-  - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-  - `output_dir`: output dir of the recognition results
-  - `batch_size`: `64` (Default), batch size of inference on gpu
-  - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
-  - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
-  - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
-  - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
-  - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
-
-- Decode with multi GPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --batch_size 64 \
-    --gpu_inference true \
-    --gpuid_list "0,1"
-```
-- Decode with multi-thread CPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --gpu_inference false \
-    --njob 64
-```
-
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
new file mode 120000
index 000000000..30ea6fc1e
--- /dev/null
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/vad/TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/README.md b/egs_modelscope/vad/TEMPLATE/README.md
new file mode 100644
index 000000000..84601b01a
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/README.md
@@ -0,0 +1,110 @@
+# Voice Activity Detection
+
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the FSMN-VAD model as an example to demonstrate the usage.
+
+## Inference
+
+### Quick start
+#### [FSMN-VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+)
+
+segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
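+
+The `segments_result` above holds the detected speech segments. The exact structure should be checked from the printed output; the sketch below assumes a `text` field containing `[start_ms, end_ms]` pairs (in milliseconds) and a local copy of the example wav, and uses the segments to cut the audio into per-segment clips:
+```python
+import soundfile
+
+# Assumption: segments_result looks like {"text": [[start_ms, end_ms], ...]}
+speech, sample_rate = soundfile.read("vad_example.wav")
+for idx, (start_ms, end_ms) in enumerate(segments_result["text"]):
+    start = int(start_ms * sample_rate / 1000)
+    end = int(end_ms * sample_rate / 1000)
+    soundfile.write(f"segment_{idx}.wav", speech[start:end], sample_rate)
+```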
+#### [FSMN-VAD-online model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+inference_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+)
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
+
+param_dict = {"in_cache": dict(), "is_final": False}
+chunk_stride = 1600  # 100ms at a 16 kHz sampling rate
+# first chunk, 100ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 100ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+For the full demo code, please refer to the [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236).
+
+
+
+#### API-reference
+##### Define pipeline
+- `task`: `Tasks.voice_activity_detection`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decode on GPU; if `ngpu=0`, decode on CPU
+- `ncpu`: `1` (Default), the number of threads used for intraop parallelism on CPU
+- `output_dir`: `None` (Default), the output path of the results if set
+- `batch_size`: `1` (Default), batch size when decoding
+##### Infer pipeline
+- `audio_in`: the input to decode, which could be:
+  - wav_path, `e.g.`: asr_example.wav
+  - pcm_path, `e.g.`: asr_example.pcm
+  - audio bytes stream, `e.g.`: bytes data from a microphone
+  - audio samples, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+  - wav.scp, a kaldi-style wav list (`wav_id \t wav_path`), `e.g.`:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  When the input is a `wav.scp` file, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is a pcm file
+- `output_dir`: `None` (Default), the output path of the results if set
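+
+For example, a `wav.scp` list can be decoded in a single call. This is a minimal sketch that assumes the `./data/test/wav.scp` and `./results` paths used by the recipe below:
+```python
+inference_pipeline = pipeline(
+    task=Tasks.voice_activity_detection,
+    model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+    output_dir='./results',  # required for wav.scp input
+    batch_size=64,
+)
+inference_pipeline(audio_in='./data/test/wav.scp')
+```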
+
+### Inference with multi-threaded CPUs or multiple GPUs
+FunASR also offers the recipe [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-threaded CPUs or multiple GPUs.
+
+- Setting parameters in `infer.sh`
+  - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+  - `data_dir`: the dataset dir, which needs to include `wav.scp`
+  - `output_dir`: output dir of the recognition results
+  - `batch_size`: `64` (Default), batch size for GPU decoding
+  - `gpu_inference`: `true` (Default), whether to perform GPU decoding; set to `false` for CPU inference
+  - `gpuid_list`: `0,1` (Default), which GPU ids are used for inference
+  - `njob`: `64` (Default), the number of jobs for CPU decoding; only used when `gpu_inference` is `false`
+  - `checkpoint_dir`: only used for decoding with finetuned models; the dir of the finetuned model
+  - `checkpoint_name`: `valid.cer_ctc.ave.pb` (Default), which checkpoint of the finetuned model is used for decoding
+
+- Decode with multiple GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1"
+```
+- Decode with multi-threaded CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64
+```
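+
+After decoding finishes, `infer.sh` merges the per-job outputs into `${output_dir}/1best_recog/` (see the recipe in this directory). If those files are produced for your model, they can be inspected with, e.g.:
+```shell
+# merged VAD outputs collected from all decoding jobs
+head ./results/1best_recog/text
+```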
+
+## Finetune with pipeline
+
+### Quick start
+
+### Finetune with your data
+
+## Inference with your finetuned model
+
diff --git a/egs_modelscope/vad/TEMPLATE/infer.py b/egs_modelscope/vad/TEMPLATE/infer.py
new file mode 100644
index 000000000..3d9ee5520
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.voice_activity_detection,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+    )
+    inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/infer.sh b/egs_modelscope/vad/TEMPLATE/infer.sh
new file mode 100644
index 000000000..261b5e66b
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/infer.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_fsmn_vad_zh-cn-16k-common"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
+njob=64               # the number of jobs for CPU decoding; only used if gpu_inference=false
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+# split wav.scp into ${nj} pieces, one per decoding job
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+    model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+    {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid}
+    }&
+    done
+    wait
+
+    # merge the per-job outputs into ${output_dir}/1best_recog
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+            for i in $(seq "${nj}"); do
+                cat "${output_dir}/output.${i}/1best_recog/${f}"
+            done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
diff --git a/egs_modelscope/vad/TEMPLATE/utils b/egs_modelscope/vad/TEMPLATE/utils
new file mode 120000
index 000000000..dc7d4171f
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file