#!/usr/bin/env bash
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages, selected with the stage / stop_stage variables:
#   1  run infer.py once per job in parallel and merge the per-job outputs
#   2  compute CER with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalisation, then utils/error_rate_zh
#
# wav.scp is split into one list per job before any stage runs. The variables
# below are defaults; presumably utils/parse_options.sh lets the caller
# override them from the command line -- confirm against that script.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true  # whether to perform gpu decoding
gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
njob=64             # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the flag as a string. Expanding it bare (`if ${gpu_inference} == ...`)
# would execute whatever value it holds as a command.
if [ "${gpu_inference}" = "true" ]; then
    # One job per entry in the comma-separated GPU id list.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    # Not decoding on GPU: every job is handed the id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p "${output_dir}/split"
# Built as a space-separated string and expanded unquoted on purpose, so that
# each path becomes its own argument to split_scp.pl.
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
    model=${checkpoint_dir}/${model}
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Turn "0,1" into an array so job N picks the (N-1)-th id.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
            id=$((JOB-1))
            gpuid=${gpuid_list_array[$id]}
            mkdir -p "${output_dir}/output.$JOB"
            python infer.py \
                --model "${model}" \
                --audio_in "${output_dir}/split/wav.$JOB.scp" \
                --output_dir "${output_dir}/output.$JOB" \
                --batch_size "${batch_size}" \
                --gpuid "${gpuid}"
        } &
        pids="${pids} $!"
    done
    # A bare `wait` returns 0 even when a job failed, which hides decoding
    # errors from `set -e`. Wait on every pid, then fail once all have finished.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs into one sorted file per kind.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab, i.e. a key with no text after it.
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi
#!/usr/bin/env bash
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages, selected with the stage / stop_stage variables:
#   1  run infer.py once per job in parallel and merge the per-job outputs
#   2  compute CER with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalisation, then utils/error_rate_zh
#
# wav.scp is split into one list per job before any stage runs. The variables
# below are defaults; presumably utils/parse_options.sh lets the caller
# override them from the command line -- confirm against that script.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true  # whether to perform gpu decoding
gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
njob=64             # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the flag as a string. Expanding it bare (`if ${gpu_inference} == ...`)
# would execute whatever value it holds as a command.
if [ "${gpu_inference}" = "true" ]; then
    # One job per entry in the comma-separated GPU id list.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    # Not decoding on GPU: every job is handed the id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p "${output_dir}/split"
# Built as a space-separated string and expanded unquoted on purpose, so that
# each path becomes its own argument to split_scp.pl.
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
    model=${checkpoint_dir}/${model}
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Turn "0,1" into an array so job N picks the (N-1)-th id.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
            id=$((JOB-1))
            gpuid=${gpuid_list_array[$id]}
            mkdir -p "${output_dir}/output.$JOB"
            python infer.py \
                --model "${model}" \
                --audio_in "${output_dir}/split/wav.$JOB.scp" \
                --output_dir "${output_dir}/output.$JOB" \
                --batch_size "${batch_size}" \
                --gpuid "${gpuid}"
        } &
        pids="${pids} $!"
    done
    # A bare `wait` returns 0 even when a job failed, which hides decoding
    # errors from `set -e`. Wait on every pid, then fail once all have finished.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs into one sorted file per kind.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab, i.e. a key with no text after it.
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi
#!/usr/bin/env bash
# Decode a test set with a ModelScope Paraformer model and score the result.
#
# Stages, selected with the stage / stop_stage variables:
#   1  run infer.py once per job in parallel and merge the per-job outputs
#   2  compute CER with utils/compute_wer.py
#   3  SpeechIO TIOBE text normalisation, then utils/error_rate_zh
#
# wav.scp is split into one list per job before any stage runs. The variables
# below are defaults; presumably utils/parse_options.sh lets the caller
# override them from the command line -- confirm against that script.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true  # whether to perform gpu decoding
gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
njob=64             # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

# Compare the flag as a string. Expanding it bare (`if ${gpu_inference} == ...`)
# would execute whatever value it holds as a command.
if [ "${gpu_inference}" = "true" ]; then
    # One job per entry in the comma-separated GPU id list.
    nj=$(echo "${gpuid_list}" | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    # Not decoding on GPU: every job is handed the id -1.
    gpuid_list=""
    for JOB in $(seq "${nj}"); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p "${output_dir}/split"
# Built as a space-separated string and expanded unquoted on purpose, so that
# each path becomes its own argument to split_scp.pl.
split_scps=""
for JOB in $(seq "${nj}"); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl "${data_dir}/wav.scp" ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py "${model}" "${checkpoint_dir}" "${checkpoint_name}"
    model=${checkpoint_dir}/${model}
fi

if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    echo "Decoding ..."
    # Turn "0,1" into an array so job N picks the (N-1)-th id.
    gpuid_list_array=(${gpuid_list//,/ })
    pids=""
    for JOB in $(seq "${nj}"); do
        {
            id=$((JOB-1))
            gpuid=${gpuid_list_array[$id]}
            mkdir -p "${output_dir}/output.$JOB"
            python infer.py \
                --model "${model}" \
                --audio_in "${output_dir}/split/wav.$JOB.scp" \
                --output_dir "${output_dir}/output.$JOB" \
                --batch_size "${batch_size}" \
                --gpuid "${gpuid}"
        } &
        pids="${pids} $!"
    done
    # A bare `wait` returns 0 even when a job failed, which hides decoding
    # errors from `set -e`. Wait on every pid, then fail once all have finished.
    failed=0
    for pid in ${pids}; do
        wait "${pid}" || failed=1
    done
    if [ "${failed}" -ne 0 ]; then
        echo "$0: at least one decoding job failed" >&2
        exit 1
    fi

    # Merge the per-job outputs into one sorted file per kind.
    mkdir -p "${output_dir}/1best_recog"
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    echo "Computing WER ..."
    cp "${output_dir}/1best_recog/text" "${output_dir}/1best_recog/text.proc"
    cp "${data_dir}/text" "${output_dir}/1best_recog/text.ref"
    python utils/compute_wer.py "${output_dir}/1best_recog/text.ref" "${output_dir}/1best_recog/text.proc" "${output_dir}/1best_recog/text.cer"
    tail -n 3 "${output_dir}/1best_recog/text.cer"
fi

if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${data_dir}/text" \
        "${output_dir}/1best_recog/ref.txt"

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        "${output_dir}/1best_recog/text.proc" \
        "${output_dir}/1best_recog/rec.txt"
    # Drop lines that end in a tab, i.e. a key with no text after it.
    grep -v $'\t$' "${output_dir}/1best_recog/rec.txt" > "${output_dir}/1best_recog/rec_non_empty.txt"

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref "${output_dir}/1best_recog/ref.txt" \
        --hyp "${output_dir}/1best_recog/rec_non_empty.txt" \
        "${output_dir}/1best_recog/DETAILS.txt" | tee "${output_dir}/1best_recog/RESULTS.txt"
    rm -rf "${output_dir}/1best_recog/rec.txt" "${output_dir}/1best_recog/rec_non_empty.txt"
fi
// ASR _FUNASRAPI FUNASR_HANDLE FunASRInit(std::map& model_path, int thread_num); - -_FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback); -_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback); -_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback); -_FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback); +// buffer +_FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); +// file, support wav & pcm +_FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); _FUNASRAPI const char* FunASRGetResult(FUNASR_RESULT result,int n_index); _FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result); @@ -63,9 +68,12 @@ _FUNASRAPI void FunASRUninit(FUNASR_HANDLE handle); _FUNASRAPI const float FunASRGetRetSnippetTime(FUNASR_RESULT result); // VAD -_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map& model_path, int thread_num); +_FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map& model_path, int thread_num, FSMN_VAD_MODE mode=FSMN_VAD_OFFLINE); +// buffer +_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); +// file, support wav & pcm +_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); -_FUNASRAPI FUNASR_RESULT FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK 
fn_callback); _FUNASRAPI std::vector>* FsmnVadGetResult(FUNASR_RESULT result,int n_index); _FUNASRAPI void FsmnVadFreeResult(FUNASR_RESULT result); _FUNASRAPI void FsmnVadUninit(FUNASR_HANDLE handle); @@ -78,8 +86,10 @@ _FUNASRAPI void CTTransformerUninit(FUNASR_HANDLE handle); //OfflineStream _FUNASRAPI FUNASR_HANDLE FunOfflineInit(std::map& model_path, int thread_num); -_FUNASRAPI FUNASR_RESULT FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback); -_FUNASRAPI FUNASR_RESULT FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback); +// buffer +_FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); +// file, support wav & pcm +_FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000); _FUNASRAPI void FunOfflineUninit(FUNASR_HANDLE handle); #ifdef __cplusplus diff --git a/funasr/runtime/onnxruntime/include/vad-model.h b/funasr/runtime/onnxruntime/include/vad-model.h index 2a8d6e4d6..e37bd976a 100644 --- a/funasr/runtime/onnxruntime/include/vad-model.h +++ b/funasr/runtime/onnxruntime/include/vad-model.h @@ -16,7 +16,7 @@ class VadModel { virtual void LoadConfigFromYaml(const char* filename)=0; virtual void FbankKaldi(float sample_rate, std::vector> &vad_feats, const std::vector &waves)=0; - virtual std::vector> &LfrCmvn(std::vector> &vad_feats)=0; + virtual void LfrCmvn(std::vector> &vad_feats)=0; virtual void Forward( const std::vector> &chunk_feats, std::vector> *out_prob)=0; @@ -24,6 +24,6 @@ class VadModel { virtual void InitCache()=0; }; -VadModel *CreateVadModel(std::map& model_path, int thread_num); +VadModel *CreateVadModel(std::map& model_path, int thread_num, int mode); } // namespace funasr #endif diff --git 
a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md index 5b42c3048..8b5d68a77 100644 --- a/funasr/runtime/onnxruntime/readme.md +++ b/funasr/runtime/onnxruntime/readme.md @@ -43,11 +43,10 @@ make ### funasr-onnx-offline ```shell -./funasr-onnx-offline [--wav-scp <string>] [--wav-path <string>] - [--punc-quant <string>] [--punc-dir <string>] - [--vad-quant <string>] [--vad-dir <string>] - [--quantize <string>] --model-dir <string> - [--] [--version] [-h] +./funasr-onnx-offline --model-dir <string> [--quantize <string>] + [--vad-dir <string>] [--vad-quant <string>] + [--punc-dir <string>] [--punc-quant <string>] + --wav-path <string> [--] [--version] [-h] Where: --model-dir <string> (required) the asr model path, which contains model.onnx, config.yaml, am.mvn @@ -64,12 +63,13 @@ Where: --punc-quant <string> false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir - --wav-scp <string> - wave scp path --wav-path <string> - wave file path + (required) the input could be: + wav_path, e.g.: asr_example.wav; + pcm_path, e.g.: asr_example.pcm; + wav.scp, kaldi style wav list (wav_id \t wav_path) - Required: --model-dir <string> + Required: --model-dir <string> --wav-path <string> If use vad, please add: --vad-dir <string> If use punc, please add: --punc-dir <string> @@ -84,20 +84,20 @@ For example: ### funasr-onnx-offline-vad ```shell -./funasr-onnx-offline-vad [--wav-scp <string>] [--wav-path <string>] - [--quantize <string>] --model-dir <string> - [--] [--version] [-h] +./funasr-onnx-offline-vad --model-dir <string> [--quantize <string>] + --wav-path <string> [--] [--version] [-h] Where: --model-dir <string> (required) the vad model path, which contains model.onnx, vad.yaml, vad.mvn --quantize <string> false (Default), load the model of model.onnx in model_dir. 
If set true, load the model of model_quant.onnx in model_dir - --wav-scp <string> - wave scp path --wav-path <string> - wave file path + (required) the input could be: + wav_path, e.g.: asr_example.wav; + pcm_path, e.g.: asr_example.pcm; + wav.scp, kaldi style wav list (wav_id \t wav_path) - Required: --model-dir <string> + Required: --model-dir <string> --wav-path <string> For example: ./funasr-onnx-offline-vad \ @@ -107,17 +107,17 @@ For example: ### funasr-onnx-offline-punc ```shell -./funasr-onnx-offline-punc [--txt-path <string>] [--quantize <string>] - --model-dir <string> [--] [--version] [-h] +./funasr-onnx-offline-punc --model-dir <string> [--quantize <string>] + --txt-path <string> [--] [--version] [-h] Where: --model-dir <string> (required) the punc model path, which contains model.onnx, punc.yaml --quantize <string> false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir --txt-path <string> - txt file path, one sentence per line + (required) txt file path, one sentence per line - Required: --model-dir <string> + Required: --model-dir <string> --txt-path <string> For example: ./funasr-onnx-offline-punc \ @@ -126,8 +126,8 @@ For example: ``` ### funasr-onnx-offline-rtf ```shell -./funasr-onnx-offline-rtf --thread-num <int32_t> --wav-scp <string> - [--quantize <string>] --model-dir <string> +./funasr-onnx-offline-rtf --model-dir <string> [--quantize <string>] + --wav-path <string> --thread-num <int32_t> [--] [--version] [-h] Where: --thread-num <int32_t> @@ -136,8 +136,11 @@ Where: (required) the model path, which contains model.onnx, config.yaml, am.mvn --quantize <string> false (Default), load the model of model.onnx in model_dir. 
If set true, load the model of model_quant.onnx in model_dir - --wav-scp - (required) wave scp path + --wav-path + (required) the input could be: + wav_path, e.g.: asr_example.wav; + pcm_path, e.g.: asr_example.pcm; + wav.scp, kaldi style wav list (wav_id \t wav_path) For example: ./funasr-onnx-offline-rtf \ diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp index 0a646f0ca..f06153498 100644 --- a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp +++ b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp @@ -225,7 +225,7 @@ void FsmnVad::LoadCmvn(const char *filename) } } -std::vector> &FsmnVad::LfrCmvn(std::vector> &vad_feats) { +void FsmnVad::LfrCmvn(std::vector> &vad_feats) { std::vector> out_feats; int T = vad_feats.size(); @@ -264,7 +264,6 @@ std::vector> &FsmnVad::LfrCmvn(std::vector } } vad_feats = out_feats; - return vad_feats; } std::vector> @@ -272,7 +271,7 @@ FsmnVad::Infer(const std::vector &waves) { std::vector> vad_feats; std::vector> vad_probs; FbankKaldi(vad_sample_rate_, vad_feats, waves); - vad_feats = LfrCmvn(vad_feats); + LfrCmvn(vad_feats); Forward(vad_feats, &vad_probs); E2EVadModel vad_scorer = E2EVadModel(); diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.h b/funasr/runtime/onnxruntime/src/fsmn-vad.h index 7a6707c4d..3d183f8a7 100644 --- a/funasr/runtime/onnxruntime/src/fsmn-vad.h +++ b/funasr/runtime/onnxruntime/src/fsmn-vad.h @@ -36,7 +36,7 @@ private: void FbankKaldi(float sample_rate, std::vector> &vad_feats, const std::vector &waves); - std::vector> &LfrCmvn(std::vector> &vad_feats); + void LfrCmvn(std::vector> &vad_feats); void Forward( const std::vector> &chunk_feats, diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp index a8ee9a970..e18c27ee7 100644 --- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp +++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp @@ -36,7 +36,7 @@ int main(int 
/// Report whether `filename` ends in the extension `target`.
///
/// The extension is everything after the last '.' in `filename`. The
/// comparison is exact and case-sensitive, so "a.WAV" does not match "wav".
///
/// @param filename Path or file name to inspect.
/// @param target   Extension to match, without the leading dot (e.g. "wav").
/// @return true when `filename` contains a '.' and the text after the last
///         one equals `target`; false otherwise, including when there is no '.'.
bool is_target_file(const std::string& filename, const std::string& target) {
    const std::size_t dot = filename.rfind('.');
    if (dot == std::string::npos) {
        return false;
    }
    // Compare the tail in place; no temporary extension string is built.
    return filename.compare(dot + 1, std::string::npos, target) == 0;
}
key, std::map& model_path) { if (value_arg.isSet()){ @@ -94,19 +103,19 @@ int main(int argc, char *argv[]) TCLAP::ValueArg model_dir("", MODEL_DIR, "the model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string"); TCLAP::ValueArg quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string"); - TCLAP::ValueArg wav_scp("", WAV_SCP, "wave scp path", true, "", "string"); + TCLAP::ValueArg wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string"); TCLAP::ValueArg thread_num("", THREAD_NUM, "multi-thread num for rtf", true, 0, "int32_t"); cmd.add(model_dir); cmd.add(quantize); - cmd.add(wav_scp); + cmd.add(wav_path); cmd.add(thread_num); cmd.parse(argc, argv); std::map model_path; GetValue(model_dir, MODEL_DIR, model_path); GetValue(quantize, QUANTIZE, model_path); - GetValue(wav_scp, WAV_SCP, model_path); + GetValue(wav_path, WAV_PATH, model_path); struct timeval start, end; gettimeofday(&start, NULL); @@ -125,10 +134,14 @@ int main(int argc, char *argv[]) // read wav_scp vector wav_list; - if(model_path.find(WAV_SCP)!=model_path.end()){ - ifstream in(model_path.at(WAV_SCP)); + string wav_path_ = model_path.at(WAV_PATH); + if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){ + wav_list.emplace_back(wav_path_); + } + else if(is_target_file(wav_path_, "scp")){ + ifstream in(wav_path_); if (!in.is_open()) { - LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP); + LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ; return 0; } string line; @@ -140,6 +153,9 @@ int main(int argc, char *argv[]) wav_list.emplace_back(column2); } in.close(); + }else{ + LOG(ERROR)<<"Please check the wav extension!"; + exit(-1); } // 多线程测试 diff --git 
/// Report whether `filename` ends in the extension `target`.
///
/// The extension is everything after the last '.' in `filename`. The
/// comparison is exact and case-sensitive, so "a.WAV" does not match "wav".
///
/// @param filename Path or file name to inspect.
/// @param target   Extension to match, without the leading dot (e.g. "wav").
/// @return true when `filename` contains a '.' and the text after the last
///         one equals `target`; false otherwise, including when there is no '.'.
bool is_target_file(const std::string& filename, const std::string& target) {
    const std::size_t dot = filename.rfind('.');
    if (dot == std::string::npos) {
        return false;
    }
    // Compare the tail in place; no temporary extension string is built.
    return filename.compare(dot + 1, std::string::npos, target) == 0;
}
/// Report whether `filename` ends in the extension `target`.
///
/// The extension is everything after the last '.' in `filename`. The
/// comparison is exact and case-sensitive, so "a.WAV" does not match "wav".
///
/// @param filename Path or file name to inspect.
/// @param target   Extension to match, without the leading dot (e.g. "wav").
/// @return true when `filename` contains a '.' and the text after the last
///         one equals `target`; false otherwise, including when there is no '.'.
bool is_target_file(const std::string& filename, const std::string& target) {
    const std::size_t dot = filename.rfind('.');
    if (dot == std::string::npos) {
        return false;
    }
    // Compare the tail in place; no temporary extension string is built.
    return filename.compare(dot + 1, std::string::npos, target) == 0;
}
which contains model.onnx, punc.yaml", false, "", "string"); TCLAP::ValueArg punc_quant("", PUNC_QUANT, "false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir", false, "false", "string"); - TCLAP::ValueArg wav_path("", WAV_PATH, "wave file path", false, "", "string"); - TCLAP::ValueArg wav_scp("", WAV_SCP, "wave scp path", false, "", "string"); + TCLAP::ValueArg wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string"); cmd.add(model_dir); cmd.add(quantize); @@ -51,7 +59,6 @@ int main(int argc, char** argv) cmd.add(punc_dir); cmd.add(punc_quant); cmd.add(wav_path); - cmd.add(wav_scp); cmd.parse(argc, argv); std::map model_path; @@ -62,7 +69,6 @@ int main(int argc, char** argv) GetValue(punc_dir, PUNC_DIR, model_path); GetValue(punc_quant, PUNC_QUANT, model_path); GetValue(wav_path, WAV_PATH, model_path); - GetValue(wav_scp, WAV_SCP, model_path); struct timeval start, end; gettimeofday(&start, NULL); @@ -80,14 +86,14 @@ int main(int argc, char** argv) long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s"; - // read wav_path and wav_scp + // read wav_path vector wav_list; - - if(model_path.find(WAV_PATH)!=model_path.end()){ - wav_list.emplace_back(model_path.at(WAV_PATH)); + string wav_path_ = model_path.at(WAV_PATH); + if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){ + wav_list.emplace_back(wav_path_); } - if(model_path.find(WAV_SCP)!=model_path.end()){ - ifstream in(model_path.at(WAV_SCP)); + else if(is_target_file(wav_path_, "scp")){ + ifstream in(wav_path_); if (!in.is_open()) { LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ; return 0; @@ -101,13 +107,16 @@ int main(int argc, char** argv) wav_list.emplace_back(column2); 
} in.close(); + }else{ + LOG(ERROR)<<"Please check the wav extension!"; + exit(-1); } float snippet_time = 0.0f; long taking_micros = 0; for(auto& wav_file : wav_list){ gettimeofday(&start, NULL); - FUNASR_RESULT result=FunOfflineRecogFile(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL); + FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, 16000); gettimeofday(&end, NULL); seconds = (end.tv_sec - start.tv_sec); taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec); diff --git a/funasr/runtime/onnxruntime/src/funasrruntime.cpp b/funasr/runtime/onnxruntime/src/funasrruntime.cpp index 893ba70d7..adef5049e 100644 --- a/funasr/runtime/onnxruntime/src/funasrruntime.cpp +++ b/funasr/runtime/onnxruntime/src/funasrruntime.cpp @@ -11,9 +11,9 @@ extern "C" { return mm; } - _FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map& model_path, int thread_num) + _FUNASRAPI FUNASR_HANDLE FsmnVadInit(std::map& model_path, int thread_num, FSMN_VAD_MODE mode) { - funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num); + funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num, mode); return mm; } @@ -30,36 +30,7 @@ extern "C" { } // APIs for ASR Infer - _FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback) - { - funasr::Model* recog_obj = (funasr::Model*)handle; - if (!recog_obj) - return nullptr; - - int32_t sampling_rate = -1; - funasr::Audio audio(1); - if (!audio.LoadWav(sz_buf, n_len, &sampling_rate)) - return nullptr; - - float* buff; - int len; - int flag=0; - funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; - p_result->snippet_time = audio.GetTimeLen(); - int n_step = 0; - int n_total = audio.GetQueueSize(); - while (audio.Fetch(buff, len, flag) > 0) { - string msg = recog_obj->Forward(buff, len, flag); - p_result->msg += msg; - n_step++; - if (fn_callback) - fn_callback(n_step, n_total); - } - - return 
p_result; - } - - _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback) + _FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) { funasr::Model* recog_obj = (funasr::Model*)handle; if (!recog_obj) @@ -87,23 +58,32 @@ extern "C" { return p_result; } - _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback) + _FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) { funasr::Model* recog_obj = (funasr::Model*)handle; if (!recog_obj) return nullptr; funasr::Audio audio(1); - if (!audio.LoadPcmwav(sz_filename, &sampling_rate)) - return nullptr; + if(funasr::is_target_file(sz_filename, "wav")){ + int32_t sampling_rate_ = -1; + if(!audio.LoadWav(sz_filename, &sampling_rate_)) + return nullptr; + }else if(funasr::is_target_file(sz_filename, "pcm")){ + if (!audio.LoadPcmwav(sz_filename, &sampling_rate)) + return nullptr; + }else{ + LOG(ERROR)<<"Wrong wav extension"; + exit(-1); + } float* buff; int len; int flag = 0; - funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; - p_result->snippet_time = audio.GetTimeLen(); int n_step = 0; int n_total = audio.GetQueueSize(); + funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; + p_result->snippet_time = audio.GetTimeLen(); while (audio.Fetch(buff, len, flag) > 0) { string msg = recog_obj->Forward(buff, len, flag); p_result->msg += msg; @@ -115,45 +95,15 @@ extern "C" { return p_result; } - _FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback) - { - funasr::Model* recog_obj = (funasr::Model*)handle; - if (!recog_obj) - return 
nullptr; - - int32_t sampling_rate = -1; - funasr::Audio audio(1); - if(!audio.LoadWav(sz_wavfile, &sampling_rate)) - return nullptr; - - float* buff; - int len; - int flag = 0; - int n_step = 0; - int n_total = audio.GetQueueSize(); - funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; - p_result->snippet_time = audio.GetTimeLen(); - while (audio.Fetch(buff, len, flag) > 0) { - string msg = recog_obj->Forward(buff, len, flag); - p_result->msg+= msg; - n_step++; - if (fn_callback) - fn_callback(n_step, n_total); - } - - return p_result; - } - // APIs for VAD Infer - _FUNASRAPI FUNASR_RESULT FsmnVadWavFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback) + _FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) { funasr::VadModel* vad_obj = (funasr::VadModel*)handle; if (!vad_obj) return nullptr; - - int32_t sampling_rate = -1; + funasr::Audio audio(1); - if(!audio.LoadWav(sz_wavfile, &sampling_rate)) + if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate)) return nullptr; funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT; @@ -166,6 +116,35 @@ extern "C" { return p_result; } + _FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) + { + funasr::VadModel* vad_obj = (funasr::VadModel*)handle; + if (!vad_obj) + return nullptr; + + funasr::Audio audio(1); + if(funasr::is_target_file(sz_filename, "wav")){ + int32_t sampling_rate_ = -1; + if(!audio.LoadWav(sz_filename, &sampling_rate_)) + return nullptr; + }else if(funasr::is_target_file(sz_filename, "pcm")){ + if (!audio.LoadPcmwav(sz_filename, &sampling_rate)) + return nullptr; + }else{ + LOG(ERROR)<<"Wrong wav extension"; + exit(-1); + } + + funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT; + p_result->snippet_time = 
audio.GetTimeLen(); + + vector> vad_segments; + audio.Split(vad_obj, vad_segments); + p_result->segments = new vector>(vad_segments); + + return p_result; + } + // APIs for PUNC Infer _FUNASRAPI const std::string CTTransformerInfer(FUNASR_HANDLE handle, const char* sz_sentence, FUNASR_MODE mode, QM_CALLBACK fn_callback) { @@ -178,43 +157,7 @@ extern "C" { } // APIs for Offline-stream Infer - _FUNASRAPI FUNASR_RESULT FunOfflineRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback) - { - funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle; - if (!offline_stream) - return nullptr; - - int32_t sampling_rate = -1; - funasr::Audio audio(1); - if(!audio.LoadWav(sz_wavfile, &sampling_rate)) - return nullptr; - if(offline_stream->UseVad()){ - audio.Split(offline_stream); - } - - float* buff; - int len; - int flag = 0; - int n_step = 0; - int n_total = audio.GetQueueSize(); - funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; - p_result->snippet_time = audio.GetTimeLen(); - while (audio.Fetch(buff, len, flag) > 0) { - string msg = (offline_stream->asr_handle)->Forward(buff, len, flag); - p_result->msg+= msg; - n_step++; - if (fn_callback) - fn_callback(n_step, n_total); - } - if(offline_stream->UsePunc()){ - string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str()); - p_result->msg = punc_res; - } - - return p_result; - } - - _FUNASRAPI FUNASR_RESULT FunOfflineRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback) + _FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) { funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle; if (!offline_stream) @@ -249,6 +192,50 @@ extern "C" { return p_result; } + _FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const 
char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate) + { + funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle; + if (!offline_stream) + return nullptr; + + funasr::Audio audio(1); + if(funasr::is_target_file(sz_filename, "wav")){ + int32_t sampling_rate_ = -1; + if(!audio.LoadWav(sz_filename, &sampling_rate_)) + return nullptr; + }else if(funasr::is_target_file(sz_filename, "pcm")){ + if (!audio.LoadPcmwav(sz_filename, &sampling_rate)) + return nullptr; + }else{ + LOG(ERROR)<<"Wrong wav extension"; + exit(-1); + } + if(offline_stream->UseVad()){ + audio.Split(offline_stream); + } + + float* buff; + int len; + int flag = 0; + int n_step = 0; + int n_total = audio.GetQueueSize(); + funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT; + p_result->snippet_time = audio.GetTimeLen(); + while (audio.Fetch(buff, len, flag) > 0) { + string msg = (offline_stream->asr_handle)->Forward(buff, len, flag); + p_result->msg+= msg; + n_step++; + if (fn_callback) + fn_callback(n_step, n_total); + } + if(offline_stream->UsePunc()){ + string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str()); + p_result->msg = punc_res; + } + + return p_result; + } + _FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result) { if (!result) diff --git a/funasr/runtime/onnxruntime/src/util.cpp b/funasr/runtime/onnxruntime/src/util.cpp index d29c5c0f9..755913ce6 100644 --- a/funasr/runtime/onnxruntime/src/util.cpp +++ b/funasr/runtime/onnxruntime/src/util.cpp @@ -180,4 +180,13 @@ void Glu(Tensor *din, Tensor *dout) } } +bool is_target_file(const std::string& filename, const std::string target) { + std::size_t pos = filename.find_last_of("."); + if (pos == std::string::npos) { + return false; + } + std::string extension = filename.substr(pos + 1); + return (extension == target); +} + } // namespace funasr \ No newline at end of file diff --git a/funasr/runtime/onnxruntime/src/util.h 
b/funasr/runtime/onnxruntime/src/util.h index 95ef4586a..8823a32ee 100644 --- a/funasr/runtime/onnxruntime/src/util.h +++ b/funasr/runtime/onnxruntime/src/util.h @@ -25,6 +25,7 @@ extern void FindMax(float *din, int len, float &max_val, int &max_idx); extern void Glu(Tensor *din, Tensor *dout); string PathAppend(const string &p1, const string &p2); +bool is_target_file(const std::string& filename, const std::string target); } // namespace funasr #endif diff --git a/funasr/runtime/onnxruntime/src/vad-model.cpp b/funasr/runtime/onnxruntime/src/vad-model.cpp index 764db00c9..336758f87 100644 --- a/funasr/runtime/onnxruntime/src/vad-model.cpp +++ b/funasr/runtime/onnxruntime/src/vad-model.cpp @@ -1,10 +1,14 @@ #include "precomp.h" namespace funasr { -VadModel *CreateVadModel(std::map& model_path, int thread_num) +VadModel *CreateVadModel(std::map& model_path, int thread_num, int mode) { VadModel *mm; - mm = new FsmnVad(); + if(mode == FSMN_VAD_OFFLINE){ + mm = new FsmnVad(); + }else{ + LOG(ERROR)<<"Online fsmn vad not imp!"; + } string vad_model_path; string vad_cmvn_path; diff --git a/funasr/runtime/websocket/websocketsrv.cpp b/funasr/runtime/websocket/websocketsrv.cpp index 9e566677b..1a6adbff2 100644 --- a/funasr/runtime/websocket/websocketsrv.cpp +++ b/funasr/runtime/websocket/websocketsrv.cpp @@ -25,8 +25,8 @@ void WebSocketServer::do_decoder(const std::vector& buffer, if (!buffer.empty()) { // fout.write(buffer.data(), buffer.size()); // feed data to asr engine - FUNASR_RESULT Result = FunOfflineRecogPCMBuffer( - asr_hanlde, buffer.data(), buffer.size(), 16000, RASR_NONE, NULL); + FUNASR_RESULT Result = FunOfflineInferBuffer( + asr_hanlde, buffer.data(), buffer.size(), RASR_NONE, NULL, 16000); std::string asr_result = ((FUNASR_RECOG_RESULT*)Result)->msg; // get decode result