Merge branch 'main' into dev_cmz2

zhifu gao 2023-04-07 15:54:09 +08:00 committed by GitHub
commit 2e769fb36c
51 changed files with 887 additions and 132 deletions

View File

@ -0,0 +1,53 @@
import argparse
import json

import numpy as np


def get_parser():
    parser = argparse.ArgumentParser(
        description="cmvn converter",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--cmvn-json",
        "-c",
        default=False,
        required=True,
        type=str,
        help="cmvn json file",
    )
    parser.add_argument(
        "--am-mvn",
        "-a",
        default=False,
        required=True,
        type=str,
        help="am mvn file",
    )
    return parser


def main():
    parser = get_parser()
    args = parser.parse_args()
    with open(args.cmvn_json, "r") as fin:
        cmvn_dict = json.load(fin)
    mean_stats = np.array(cmvn_dict["mean_stats"])
    var_stats = np.array(cmvn_dict["var_stats"])
    total_frame = np.array(cmvn_dict["total_frames"])
    mean = -1.0 * mean_stats / total_frame
    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
    dims = mean.shape[0]
    with open(args.am_mvn, 'w') as fout:
        fout.write("<Nnet>" + "\n" + "<Splice> " + str(dims) + " " + str(dims) + '\n' + "[ 0 ]" + "\n" + "<AddShift> " + str(dims) + " " + str(dims) + "\n")
        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
        fout.write("</Nnet>" + '\n')


if __name__ == '__main__':
    main()
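
As a sanity check on the arithmetic above: the `<AddShift>` row holds the negated mean and the `<Rescale>` row the inverse standard deviation, so applying shift-then-rescale yields zero-mean, unit-variance features. A minimal sketch with made-up statistics (not part of the converter itself):

```python
import numpy as np

# hypothetical 2-dim accumulated statistics over 4 frames
mean_stats = np.array([8.0, 12.0])   # sum of features
var_stats = np.array([20.0, 40.0])   # sum of squared features
total_frames = 4.0

shift = -1.0 * mean_stats / total_frames                                             # <AddShift> row
scale = 1.0 / np.sqrt(var_stats / total_frames - (mean_stats / total_frames) ** 2)   # <Rescale> row

feats = np.array([[1.0, 2.0], [3.0, 4.0], [1.0, 2.0], [3.0, 4.0]])  # toy frames
normalized = (feats + shift) * scale
print(normalized.mean(axis=0))  # ~[0, 0]
print(normalized.std(axis=0))   # ~[1, 1]
```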

View File

@ -45,8 +45,8 @@ def compute_wer(ref_file,
if out_item['wrong'] > 0:
rst['wrong_sentences'] += 1
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
if rst['Wrd'] > 0:
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)

View File

@ -0,0 +1,6 @@
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.5
lm_weight: 0.7
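
For orientation, in ESPnet-style joint CTC/attention decoding these weights are conventionally combined per hypothesis roughly as below; this is a hedged sketch of the scoring, not code from this repository:

```python
def joint_score(att_score, ctc_score, lm_score, length,
                ctc_weight=0.5, lm_weight=0.7, penalty=0.0):
    # weighted sum of attention-decoder, CTC and LM log-probabilities,
    # plus a per-token length bonus (the "penalty" field above)
    return ((1.0 - ctc_weight) * att_score
            + ctc_weight * ctc_score
            + lm_weight * lm_score
            + penalty * length)
```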

View File

@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50

normalize: None

View File

@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
- - valid
  - acc
  - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8
log_interval: 50

normalize: utterance_mvn

View File

@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Copyright 2014 Vassil Panayotov
# 2014 Johns Hopkins University (author: Daniel Povey)
# Apache 2.0
if [ "$#" -ne 2 ]; then
echo "Usage: $0 <src-dir> <dst-dir>"
echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
exit 1
fi
src=$1
dst=$2
# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
echo "Please install 'flac' on ALL worker nodes!"
exit 1
fi
spk_file=$src/../SPEAKERS.TXT
mkdir -p $dst || exit 1
[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans
for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
reader=$(basename $reader_dir)
if ! [ $reader -eq $reader ]; then # not integer.
echo "$0: unexpected subdirectory name $reader"
exit 1
fi
for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
chapter=$(basename $chapter_dir)
if ! [ "$chapter" -eq "$chapter" ]; then
echo "$0: unexpected chapter-subdirectory name $chapter"
exit 1
fi
find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp|| exit 1
chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
[ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
cat $chapter_trans >>$trans
done
done
echo "$0: successfully prepared data in $dst"
exit 0
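
For reference, the awk command above emits one `<utterance-id> <flac-path>` pair per line into wav.scp, and the chapter transcripts are concatenated into text, so the prepared files look roughly like this (paths follow the usage example at the top of the script; the transcript line is illustrative):

```
# wav.scp
1272-128104-0000 /export/a15/vpanayotov/data/LibriSpeech/dev-clean/1272/128104/1272-128104-0000.flac

# text
1272-128104-0000 MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES ...
```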

View File

@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..
# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

egs/librispeech/conformer/run.sh (executable file, 262 lines)
View File

@ -0,0 +1,262 @@
#!/usr/bin/env bash
. ./path.sh || exit 1;
# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
gpu_num=8
count=1
gpu_inference=true # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl
# general configuration
feats_dir="../DATA" # feature output directory
exp_dir="."
lang=en
dumpdir=dump/fbank
feats_type=fbank
token_type=bpe
dataset_type=large
scp=feats.scp
type=kaldi_ark
stage=3
stop_stage=4
# feature configuration
feats_dim=80
sample_frequency=16000
nj=100
speed_perturb="0.9,1.0,1.1"
# data
data_librispeech=
# bpe model
nbpe=5000
bpemode=unigram
# exp tag
tag=""
. utils/parse_options.sh || exit 1;
# Set bash to 'debug' mode, it will exit on :
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail
train_set=train_960
valid_set=dev
test_sets="test_clean test_other dev_clean dev_other"
asr_config=conf/train_asr_conformer.yaml
#asr_config=conf/train_asr_conformer_uttnorm.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
inference_asr_model=valid.acc.ave_10best.pth
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
if ${gpu_inference}; then
inference_nj=$[${ngpu}*${njob}]
_ngpu=1
else
inference_nj=$njob
_ngpu=0
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
# Data preparation
for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
done
fi
feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir}
feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir}
feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir}
feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir}
feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir}
feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature Generation"
# compute fbank features
fbankdir=${feats_dir}/fbank
for x in dev_clean dev_other test_clean test_other; do
utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x}
utils/fix_data_feat.sh ${fbankdir}/${x}
done
mkdir ${feats_dir}/data/$train_set
train_sets="train_clean_100 train_clean_360 train_other_500"
for file in wav.scp text; do
( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
done
utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set
utils/fix_data_feat.sh ${fbankdir}/$train_set
# compute global cmvn
utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set
# apply cmvn
utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1\
${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir}
utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir}
cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir}
cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir}
cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir}
cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir}
cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir}
dev_sets="dev_clean dev_other"
for file in feats.scp text speech_shape text_shape; do
( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1;
done
#generate ark list
utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir}
utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir}
fi
dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
### Task dependent. You have to check non-linguistic symbols used in the corpus.
echo "stage 2: Dictionary and Json Data Preparation"
mkdir -p ${feats_dir}/data/lang_char/
echo "<blank>" > ${dict}
echo "<s>" >> ${dict}
echo "</s>" >> ${dict}
cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict}
echo "<unk>" >> ${dict}
wc -l ${dict}
vocab_size=$(cat ${dict} | wc -l)
awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set
mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set
cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
fi
# Training Stage
world_size=$gpu_num # run on one machine
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
echo "stage 3: Training"
mkdir -p ${exp_dir}/exp/${model_dir}
mkdir -p ${exp_dir}/exp/${model_dir}/log
INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
if [ -f $INIT_FILE ];then
rm -f $INIT_FILE
fi
init_method=file://$(readlink -f $INIT_FILE)
echo "$0: init method is $init_method"
for ((i = 0; i < $gpu_num; ++i)); do
{
rank=$i
local_rank=$i
gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
asr_train.py \
--gpu_id $gpu_id \
--use_preprocessor true \
--split_with_space false \
--bpemodel ${bpemodel}.model \
--token_type $token_type \
--dataset_type $dataset_type \
--token_list $dict \
--train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \
--valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \
--resume true \
--output_dir ${exp_dir}/exp/${model_dir} \
--config $asr_config \
--input_size $feats_dim \
--ngpu $gpu_num \
--num_worker_count $count \
--multiprocessing_distributed true \
--dist_init_method $init_method \
--dist_world_size $world_size \
--dist_rank $rank \
--local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
} &
done
wait
fi
# Testing Stage
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
echo "stage 4: Inference"
for dset in ${test_sets}; do
asr_exp=${exp_dir}/exp/${model_dir}
inference_tag="$(basename "${inference_config}" .yaml)"
_dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
_logdir="${_dir}/logdir"
if [ -d ${_dir} ]; then
echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
exit 0
fi
mkdir -p "${_logdir}"
_data="${feats_dir}/${dumpdir}/${dset}"
key_file=${_data}/${scp}
num_scp_file="$(<${key_file} wc -l)"
_nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
split_scps=
for n in $(seq "${_nj}"); do
split_scps+=" ${_logdir}/keys.${n}.scp"
done
# shellcheck disable=SC2086
utils/split_scp.pl "${key_file}" ${split_scps}
_opts=
if [ -n "${inference_config}" ]; then
_opts+="--config ${inference_config} "
fi
${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
python -m funasr.bin.asr_inference_launch \
--batch_size 1 \
--ngpu "${_ngpu}" \
--njob ${njob} \
--gpuid_list ${gpuid_list} \
--data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
--key_file "${_logdir}"/keys.JOB.scp \
--asr_train_config "${asr_exp}"/config.yaml \
--asr_model_file "${asr_exp}"/"${inference_asr_model}" \
--output_dir "${_logdir}"/output.JOB \
--mode asr \
${_opts}
for f in token token_int score text; do
if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${_nj}"); do
cat "${_logdir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${_dir}/${f}"
fi
done
python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
cat ${_dir}/text.cer.txt
done
fi

View File

@ -0,0 +1 @@
../../aishell/transformer/utils

View File

@ -74,7 +74,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -74,7 +74,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")

View File

@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
parser.add_argument('--audio_in', type=str, default="./data/test")
parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--batch_size', type=int, default=64)
parser.add_argument('--gpuid', type=str, default="0")

View File

@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

View File

@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -75,7 +75,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -75,7 +75,7 @@ def modelscope_infer(params):
# If text exists, compute CER
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(best_recog_path, "token")
text_proc_file = os.path.join(best_recog_path, "text")
compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

View File

@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
# compute CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
text_proc_file = os.path.join(decoding_path, "1best_recog/text")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

View File

@ -797,7 +797,7 @@ def inference_modelscope(
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

View File

@ -42,6 +42,7 @@ from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
np.set_printoptions(threshold=np.inf)
class Speech2Text:
"""Speech2Text class
@ -203,7 +204,6 @@ class Speech2Text:
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
@ -213,13 +213,16 @@ class Speech2Text:
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
feats_len = torch.tensor([feats_len])
batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
enc, enc_len = self.asr_model.encode_chunk(**batch)
enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@ -578,7 +581,22 @@ def inference_modelscope(
speech2text = Speech2TextExport(**speech2text_kwargs)
else:
speech2text = Speech2Text(**speech2text_kwargs)
def _load_bytes(input):
middle_data = np.frombuffer(input, dtype=np.int16)
middle_data = np.asarray(middle_data)
if middle_data.dtype.kind not in 'iu':
raise TypeError("'middle_data' must be an array of integers")
dtype = np.dtype('float32')
if dtype.kind != 'f':
raise TypeError("'dtype' must be a floating point type")
i = np.iinfo(middle_data.dtype)
abs_max = 2 ** (i.bits - 1)
offset = i.min + abs_max
array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
return array
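
_load_bytes above maps raw little-endian int16 PCM to float32 in roughly [-1, 1): for int16, abs_max is 32768 and offset is 0, so the conversion is effectively sample / 32768. A quick check of that scaling (independent of this function):

```python
import numpy as np

pcm = np.array([0, 16384, -32768, 32767], dtype=np.int16).tobytes()
samples = np.frombuffer(pcm, dtype=np.int16).astype(np.float32) / 32768.0
print(samples)  # [ 0.       0.5     -1.       0.99997]
```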
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@ -589,10 +607,12 @@ def inference_modelscope(
):
# 3. Build data-iterator
if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
raw_inputs = _load_bytes(data_path_and_name_and_type[0])
raw_inputs = torch.tensor(raw_inputs)
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, np.ndarray):
raw_inputs = torch.tensor(raw_inputs)
is_final = False
if param_dict is not None and "cache" in param_dict:
cache = param_dict["cache"]
@ -605,62 +625,87 @@ def inference_modelscope(
asr_result = ""
wait = True
if len(cache) == 0:
cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
cache["first_chunk"] = True
cache["speech"] = []
cache["chunk_index"] = 0
cache["speech_chunk"] = []
cache["accum_speech"] = 0
if raw_inputs is not None:
if len(cache["speech"]) == 0:
cache["speech"] = raw_inputs
else:
cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
if len(cache["speech_chunk"]) == 0:
cache["speech_chunk"] = raw_inputs
else:
cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
while len(cache["speech_chunk"]) >= 960:
cache["accum_speech"] += len(raw_inputs)
while cache["accum_speech"] >= 960:
if cache["first_chunk"]:
if len(cache["speech_chunk"]) >= 14400:
speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
speech_length = torch.tensor([14400])
if cache["accum_speech"] >= 14400:
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 5
cache["encoder"]["stride"] = 10
cache["encoder"]["left"] = 5
cache["encoder"]["right"] = 0
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"]= cache["speech_chunk"][4800:]
cache["accum_speech"] -= 4800
cache["first_chunk"] = False
cache["encoder"]["start_idx"] = -5
cache["encoder"]["is_final"] = False
wait = False
else:
if is_final:
cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
cache["encoder"]["stride"] = len(cache["speech"]) // 960
cache["encoder"]["pad_left"] = 0
cache["encoder"]["pad_right"] = 0
speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
speech_length = torch.tensor([len(cache["speech_chunk"])])
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = []
cache["accum_speech"] = 0
wait = False
else:
break
else:
if len(cache["speech_chunk"]) >= 19200:
if cache["accum_speech"] >= 19200:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
speech_length = torch.tensor([19200])
cache["encoder"]["pad_right"] = 5
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = 0
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = cache["speech_chunk"][9600:]
cache["accum_speech"] -= 9600
wait = False
else:
if is_final:
cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
cache["encoder"]["pad_right"] = 0
speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
speech_length = torch.tensor([len(cache["speech_chunk"])])
results = speech2text(cache, speech, speech_length)
cache["speech_chunk"] = []
wait = False
cache["encoder"]["is_final"] = True
if cache["accum_speech"] >= 14400:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 5
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["accum_speech"] -= 9600
wait = False
else:
cache["encoder"]["start_idx"] += 10
cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
cache["encoder"]["pad_left"] = 5
cache["encoder"]["pad_right"] = 0
cache["encoder"]["left"] = 0
cache["encoder"]["right"] = 0
speech = torch.unsqueeze(cache["speech"], axis=0)
speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
cache["accum_speech"] = 0
wait = False
else:
break
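
The chunk loop above advances in hops of 960 samples (60 ms at 16 kHz): the first chunk waits for 14400 samples and then consumes 4800, later chunks wait for 19200 and consume 9600, and the remainder is flushed once is_final is set. A stripped-down sketch of just this buffering policy, with the model call replaced by a print (names and thresholds mirror the code above; everything else is illustrative):

```python
def feed(cache, raw_samples, is_final=False):
    # simplified restatement of the buffering in _forward above; no model is called
    if not cache:
        cache.update({"speech": [], "accum": 0, "first": True})
    cache["speech"].extend(raw_samples)
    cache["accum"] += len(raw_samples)

    while cache["accum"] >= 960:
        if cache["first"]:
            if cache["accum"] >= 14400:            # 0.9 s warm-up chunk
                print("decode first chunk over", len(cache["speech"]), "samples")
                cache["accum"] -= 4800             # 0.3 s consumed
                cache["first"] = False
            elif is_final:
                print("short utterance: flush", cache["accum"], "samples")
                cache["accum"] = 0
            else:
                break                              # wait for more audio
        else:
            if cache["accum"] >= 19200:            # 1.2 s chunk with lookahead
                print("decode chunk, stride 10 frames")
                cache["accum"] -= 9600             # 0.6 s consumed
            elif is_final:
                print("flush final", cache["accum"], "samples")
                cache["accum"] = 0
            else:
                break
```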

View File

@ -338,7 +338,7 @@ def inference_modelscope(
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

View File

@ -670,7 +670,7 @@ def inference_modelscope(
ibest_writer["token"][key] = " ".join(token)
ibest_writer["token_int"][key] = " ".join(map(str, token_int))
ibest_writer["vad"][key] = "{}".format(vadsegments)
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
ibest_writer["text_with_punc"][key] = text_postprocessed_punc
if time_stamp_postprocessed is not None:
ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

View File

@ -738,13 +738,13 @@ def inference_modelscope(
ibest_writer["rtf"][key] = rtf_cur
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
# asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
logging.info("decoding, utt: {}, predictions: {}".format(key, text))
rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

View File

@ -504,13 +504,13 @@ def inference_modelscope(
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward

View File

@ -507,13 +507,13 @@ def inference_modelscope(
ibest_writer["score"][key] = str(hyp.score)
if text is not None:
text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
item = {'key': key, 'value': text_postprocessed}
asr_result_list.append(item)
finish_count += 1
asr_utils.print_progress(finish_count / file_count)
if writer is not None:
ibest_writer["text"][key] = text_postprocessed
ibest_writer["text"][key] = " ".join(word_lists)
return asr_result_list
return _forward

View File

@ -37,7 +37,7 @@ def tokenize(data,
vad = -2
if bpe_tokenizer is not None:
text = bpe_tokenizer.text2tokens(text)
text = bpe_tokenizer.text2tokens("".join(text))
if seg_dict is not None:
assert isinstance(seg_dict, dict)

View File

@ -19,6 +19,7 @@ class ModelExport:
self,
cache_dir: Union[Path, str] = None,
onnx: bool = True,
device: str = "cpu",
quant: bool = True,
fallback_num: int = 0,
audio_in: str = None,
@ -36,6 +37,7 @@ class ModelExport:
)
print("output dir: {}".format(self.cache_dir))
self.onnx = onnx
self.device = device
self.quant = quant
self.fallback_num = fallback_num
self.frontend = None
@ -112,6 +114,10 @@ class ModelExport:
else:
dummy_input = model.get_dummy_inputs()
if self.device == 'cuda':
model = model.cuda()
dummy_input = tuple([i.cuda() for i in dummy_input])
# model_script = torch.jit.script(model)
model_script = torch.jit.trace(model, dummy_input)
model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))
@ -260,6 +266,7 @@ if __name__ == '__main__':
parser.add_argument('--model-name', type=str, required=True)
parser.add_argument('--export-dir', type=str, required=True)
parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]')
parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
@ -269,6 +276,7 @@ if __name__ == '__main__':
export_model = ModelExport(
cache_dir=args.export_dir,
onnx=args.type == 'onnx',
device=args.device,
quant=args.quantize,
fallback_num=args.fallback_num,
audio_in=args.audio_in,

View File

@ -75,8 +75,8 @@ def preprocess_for_attn(x, mask, cache, pad_fn):
return x, cache
torch_version = float(".".join(torch.__version__.split(".")[:2]))
if torch_version >= 1.8:
torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]])
if torch_version >= (1, 8):
import torch.fx
torch.fx.wrap('preprocess_for_attn')
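
The switch from a float comparison to a tuple comparison matters once the minor version reaches two digits: "1.10" parsed as a float is 1.1, which is smaller than 1.8, while the tuple (1, 10) compares correctly. A quick illustration:

```python
version = "1.10.2"  # hypothetical torch.__version__

as_float = float(".".join(version.split(".")[:2]))
as_tuple = tuple(int(i) for i in version.split(".")[:2])

print(as_float >= 1.8)     # False: 1.1 < 1.8, so torch.fx would wrongly be skipped
print(as_tuple >= (1, 8))  # True: (1, 10) >= (1, 8)
```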

View File

@ -74,7 +74,7 @@ class ContextualDecoderLayer(nn.Module):
return x, tgt_mask, x_self_attn, x_src_attn
class ContexutalBiasDecoder(nn.Module):
class ContextualBiasDecoder(nn.Module):
def __init__(
self,
size,
@ -83,7 +83,7 @@ class ContexutalBiasDecoder(nn.Module):
normalize_before=True,
):
"""Construct an DecoderLayer object."""
super(ContexutalBiasDecoder, self).__init__()
super(ContextualBiasDecoder, self).__init__()
self.size = size
self.src_attn = src_attn
if src_attn is not None:
@ -186,7 +186,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder):
),
)
self.dropout = nn.Dropout(dropout_rate)
self.bias_decoder = ContexutalBiasDecoder(
self.bias_decoder = ContextualBiasDecoder(
size=attention_dim,
src_attn=MultiHeadedAttentionCrossAtt(
attention_heads, attention_dim, src_attention_dropout_rate

View File

@ -104,7 +104,6 @@ class DecoderLayerSANM(nn.Module):
x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
return x, tgt_mask, memory, memory_mask, cache
def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
@ -400,7 +399,7 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
for i in range(self.att_layer_num):
decoder = self.decoders[i]
c = cache[i]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, memory_mask, cache=c
)
new_cache.append(c_ret)
@ -410,13 +409,13 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
j = i + self.att_layer_num
decoder = self.decoders2[i]
c = cache[j]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, memory_mask, cache=c
)
new_cache.append(c_ret)
for decoder in self.decoders3:
x, tgt_mask, memory, memory_mask, _ = decoder(
x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=None
)
@ -1077,7 +1076,7 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
for i in range(self.att_layer_num):
decoder = self.decoders[i]
c = cache[i]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=c
)
new_cache.append(c_ret)
@ -1087,14 +1086,14 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
j = i + self.att_layer_num
decoder = self.decoders2[i]
c = cache[j]
x, tgt_mask, memory, memory_mask, c_ret = decoder(
x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=c
)
new_cache.append(c_ret)
for decoder in self.decoders3:
x, tgt_mask, memory, memory_mask, _ = decoder(
x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
x, tgt_mask, memory, None, cache=None
)

View File

@ -370,19 +370,10 @@ class Paraformer(AbsESPnetModel):
encoder_out, encoder_out_lens
)
assert encoder_out.size(0) == speech.size(0), (
encoder_out.size(),
speech.size(0),
)
assert encoder_out.size(1) <= encoder_out_lens.max(), (
encoder_out.size(),
encoder_out_lens.max(),
)
if intermediate_outs is not None:
return (encoder_out, intermediate_outs), encoder_out_lens
return encoder_out, encoder_out_lens
return encoder_out, torch.tensor([encoder_out.size(1)])
def calc_predictor(self, encoder_out, encoder_out_lens):
@ -1034,16 +1025,76 @@ class BiCifParaformer(Paraformer):
# 1. Encoder
encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
intermediate_outs = None
if isinstance(encoder_out, tuple):
intermediate_outs = encoder_out[1]
encoder_out = encoder_out[0]
loss_att, acc_att, cer_att, wer_att = None, None, None, None
loss_ctc, cer_ctc = None, None
loss_pre = None
stats = dict()
# 1. CTC branch
if self.ctc_weight != 0.0:
loss_ctc, cer_ctc = self._calc_ctc_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
# Collect CTC branch stats
stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
stats["cer_ctc"] = cer_ctc
# Intermediate CTC (optional)
loss_interctc = 0.0
if self.interctc_weight != 0.0 and intermediate_outs is not None:
for layer_idx, intermediate_out in intermediate_outs:
# we assume intermediate_out has the same length & padding
# as those of encoder_out
loss_ic, cer_ic = self._calc_ctc_loss(
intermediate_out, encoder_out_lens, text, text_lengths
)
loss_interctc = loss_interctc + loss_ic
# Collect Intermediate CTC stats
stats["loss_interctc_layer{}".format(layer_idx)] = (
loss_ic.detach() if loss_ic is not None else None
)
stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
loss_interctc = loss_interctc / len(intermediate_outs)
# calculate whole encoder loss
loss_ctc = (
1 - self.interctc_weight
) * loss_ctc + self.interctc_weight * loss_interctc
# 2b. Attention decoder branch
if self.ctc_weight != 1.0:
loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
loss_pre2 = self._calc_pre2_loss(
encoder_out, encoder_out_lens, text, text_lengths
)
loss = loss_pre2
# 3. CTC-Att loss definition
if self.ctc_weight == 0.0:
loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
elif self.ctc_weight == 1.0:
loss = loss_ctc
else:
loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
# Collect Attn branch stats
stats["loss_att"] = loss_att.detach() if loss_att is not None else None
stats["acc"] = acc_att
stats["cer"] = cer_att
stats["wer"] = wer_att
stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
stats["loss_pre2"] = loss_pre2.detach().cpu()
stats["loss"] = torch.clone(loss.detach())
# force_gatherable: to-device and to-tensor if scalar for DataParallel
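
With ctc_weight=0.3 (as in the configs above) and a hypothetical predictor_weight=1.0, the combined loss in this hunk works out as in the sketch below; the individual loss values are made up and only illustrate the weighting:

```python
# hypothetical per-batch loss values
loss_ctc, loss_att, loss_pre, loss_pre2 = 2.0, 1.0, 0.4, 0.2
ctc_weight, predictor_weight = 0.3, 1.0  # predictor_weight assumed

loss = (ctc_weight * loss_ctc
        + (1 - ctc_weight) * loss_att
        + loss_pre * predictor_weight
        + loss_pre2 * predictor_weight * 0.5)
print(loss)  # 0.6 + 0.7 + 0.4 + 0.1 = 1.8
```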
@ -1094,6 +1145,7 @@ class ContextualParaformer(Paraformer):
inner_dim: int = 256,
bias_encoder_type: str = 'lstm',
label_bracket: bool = False,
use_decoder_embedding: bool = False,
):
assert check_argument_types()
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
@ -1147,6 +1199,7 @@ class ContextualParaformer(Paraformer):
self.hotword_buffer = None
self.length_record = []
self.current_buffer_length = 0
self.use_decoder_embedding = use_decoder_embedding
def forward(
self,
@ -1288,7 +1341,10 @@ class ContextualParaformer(Paraformer):
hw_list.append(hw_tokens)
# padding
hw_list_pad = pad_list(hw_list, 0)
hw_embed = self.decoder.embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
hw_embed, (_, _) = self.bias_encoder(hw_embed)
_ind = np.arange(0, len(hw_list)).tolist()
# update self.hotword_buffer, throw a part if oversize
@ -1404,13 +1460,19 @@ class ContextualParaformer(Paraformer):
# default hotword list
hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)] # empty hotword list
hw_list_pad = pad_list(hw_list, 0)
hw_embed = self.bias_embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
_, (h_n, _) = self.bias_encoder(hw_embed)
contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
else:
hw_lengths = [len(i) for i in hw_list]
hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
hw_embed = self.bias_embed(hw_list_pad)
if self.use_decoder_embedding:
hw_embed = self.decoder.embed(hw_list_pad)
else:
hw_embed = self.bias_embed(hw_list_pad)
hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
enforce_sorted=False)
_, (h_n, _) = self.bias_encoder(hw_embed)

View File

@ -200,6 +200,7 @@ class CifPredictorV2(nn.Module):
return acoustic_embeds, token_num, alphas, cif_peak
def forward_chunk(self, hidden, cache=None):
b, t, d = hidden.size()
h = hidden
context = h.transpose(1, 2)
queries = self.pad(context)
@ -220,6 +221,8 @@ class CifPredictorV2(nn.Module):
alphas = alphas * mask_chunk_predictor
if cache is not None:
if cache["is_final"]:
alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
if cache["cif_hidden"] is not None:
hidden = torch.cat((cache["cif_hidden"], hidden), 1)
if cache["cif_alphas"] is not None:
@ -241,7 +244,6 @@ class CifPredictorV2(nn.Module):
mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
if mask_chunk_peak_predictor is not None:
cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)

View File

@ -8,7 +8,7 @@
import math
import torch
import torch.nn.functional as F
def _pre_hook(
state_dict,
@ -409,9 +409,18 @@ class SinusoidalPositionEncoder(torch.nn.Module):
def forward_chunk(self, x, cache=None):
start_idx = 0
pad_left = 0
pad_right = 0
batch_size, timesteps, input_dim = x.size()
if cache is not None:
start_idx = cache["start_idx"]
pad_left = cache["left"]
pad_right = cache["right"]
positions = torch.arange(1, timesteps+start_idx+1)[None, :]
position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
return x + position_encoding[:, start_idx: start_idx + timesteps]
outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
outputs = outputs.transpose(1,2)
outputs = F.pad(outputs, (pad_left, pad_right))
outputs = outputs.transpose(1,2)
return outputs
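
The added F.pad call pads along the time axis: the tensor is transposed to (batch, dim, time) so that F.pad's last-dimension padding inserts `left`/`right` zero frames at the chunk edges, then transposed back. A small shape check:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 10, 512)          # (batch, time, dim), e.g. a 10-frame chunk
pad_left, pad_right = 5, 0           # values taken from cache["left"] / cache["right"]

y = x.transpose(1, 2)                # (batch, dim, time)
y = F.pad(y, (pad_left, pad_right))  # zero frames added on the time axis
y = y.transpose(1, 2)
print(y.shape)                       # torch.Size([1, 15, 512])
```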

View File

@ -53,6 +53,68 @@ cd ../python/grpc
python grpc_main_client_mic.py --host $server_ip --port 10108
```
The `grpc_main_client_mic.py` follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send audio_data in one request, here is an example:
```python
# go to ../python/grpc to find this package
import paraformer_pb2

# standard-library / third-party imports used by this example
import asyncio
import json
import queue
import time

import grpc
import soundfile as sf


class RecognizeStub:
    def __init__(self, channel):
        self.Recognize = channel.stream_stream(
            '/paraformer.ASR/Recognize',
            request_serializer=paraformer_pb2.Request.SerializeToString,
            response_deserializer=paraformer_pb2.Response.FromString,
        )


async def send(channel, data, speaking, isEnd):
    stub = RecognizeStub(channel)
    req = paraformer_pb2.Request()
    if data:
        req.audio_data = data
    req.user = 'zz'
    req.language = 'zh-CN'
    req.speaking = speaking
    req.isEnd = isEnd
    q = queue.SimpleQueue()
    q.put(req)
    return stub.Recognize(iter(q.get, None))


# send the audio data once
async def grpc_rec(data, grpc_uri):
    with grpc.insecure_channel(grpc_uri) as channel:
        b = time.time()
        response = await send(channel, data, False, False)
        resp = response.next()
        text = ''
        if 'decoding' == resp.action:
            resp = response.next()
            if 'finish' == resp.action:
                text = json.loads(resp.sentence)['text']
        response = await send(channel, None, False, True)
        return {
            'text': text,
            'time': time.time() - b,
        }


async def test():
    # fc = FunAsrGrpcClient('127.0.0.1', 9900)
    # t = await fc.rec(wav.tobytes())
    # print(t)
    wav, _ = sf.read('z-10s.wav', dtype='int16')
    uri = '127.0.0.1:9900'
    res = await grpc_rec(wav.tobytes(), uri)
    print(res)


if __name__ == '__main__':
    asyncio.run(test())
```
## Acknowledge
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the grpc service.

View File

@ -88,7 +88,7 @@ grpc::Status ASRServicer::Recognize(
res.set_language(req.language());
stream->Write(res);
} else if (!req.speaking()) {
if (client_buffers.count(req.user()) == 0) {
if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) {
Response res;
res.set_sentence(
R"({"success": true, "detail": "waiting_for_voice"})"
@ -99,14 +99,18 @@ grpc::Status ASRServicer::Recognize(
stream->Write(res);
}else {
auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
if (req.audio_data().size() > 0) {
auto& buf = client_buffers[req.user()];
buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
}
std::string tmp_data = this->client_buffers[req.user()];
this->clear_states(req.user());
Response res;
res.set_sentence(
R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
);
int data_len_int = tmp_data.length();
int data_len_int = tmp_data.length();
std::string data_len = std::to_string(data_len_int);
std::stringstream ss;
ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")" << R"("})";
@ -129,18 +133,18 @@ grpc::Status ASRServicer::Recognize(
res.set_user(req.user());
res.set_action("finish");
res.set_language(req.language());
stream->Write(res);
}
else {
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
std::string delay_str = std::to_string(end_time - begin_time);
std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
Response res;
std::stringstream ss;
@ -150,8 +154,8 @@ grpc::Status ASRServicer::Recognize(
res.set_user(req.user());
res.set_action("finish");
res.set_language(req.language());
stream->Write(res);
}
}
@ -165,7 +169,7 @@ grpc::Status ASRServicer::Recognize(
res.set_language(req.language());
stream->Write(res);
}
}
}
return Status::OK;
}

View File

@ -109,7 +109,7 @@ class ASRServicer(paraformer_pb2_grpc.ASRServicer):
else:
asr_result = ""
elif self.backend == "onnxruntime":
from rapid_paraformer.utils.frontend import load_bytes
from funasr_onnx.utils.frontend import load_bytes
array = load_bytes(tmp_data)
asr_result = self.inference_16k_pipeline(array)[0]
end_time = int(round(time.time() * 1000))

View File

@ -31,7 +31,7 @@
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_torch
cd funasr/runtime/python/libtorch
python setup.py build
python setup.py install
```

View File

@ -1,10 +1,15 @@
from funasr_torch import Paraformer
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)
wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1) # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0) # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
print(result)

View File

@ -46,6 +46,7 @@ class Paraformer():
)
self.ort_infer = torch.jit.load(model_file)
self.batch_size = batch_size
self.device_id = device_id
self.plot_timestamp_to = plot_timestamp_to
self.pred_bias = pred_bias
@ -58,8 +59,13 @@ class Paraformer():
end_idx = min(waveform_nums, beg_idx + self.batch_size)
feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
try:
outputs = self.ort_infer(feats, feats_len)
am_scores, valid_token_lens = outputs[0], outputs[1]
with torch.no_grad():
if int(self.device_id) == -1:
outputs = self.ort_infer(feats, feats_len)
am_scores, valid_token_lens = outputs[0], outputs[1]
else:
outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
if len(outputs) == 4:
# for BiCifParaformer Inference
us_alphas, us_peaks = outputs[2], outputs[3]

View File

@ -32,7 +32,7 @@ or install from source code
```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_onnx
cd funasr/runtime/python/onnxruntime
python setup.py build
python setup.py install
```

View File

@ -1,13 +1,15 @@
from funasr_onnx import Paraformer
model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"
# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
# plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav"
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) # cpu
# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0) # gpu
# when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")
wav_path = "YourPath/xx.wav"
result = model(wav_path)
print(result)
print(result)

View File

@ -464,6 +464,12 @@ class AbsTask(ABC):
default=sys.maxsize,
help="The maximum number update step to train",
)
parser.add_argument(
"--batch_interval",
type=int,
default=10000,
help="The batch interval for saving model.",
)
group.add_argument(
"--patience",
type=int_or_none,
@ -1355,15 +1361,15 @@ class AbsTask(ABC):
from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
seg_dict_file=args.seg_dict_file if hasattr(args,
"seg_dict_file") else None,
seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
mode="train")
valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf,
frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
seg_dict_file=args.seg_dict_file if hasattr(args,
"seg_dict_file") else None,
seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
mode="eval")
elif args.dataset_type == "small":
train_iter_factory = cls.build_iter_factory(
@ -1576,13 +1582,18 @@ class AbsTask(ABC):
) -> AbsIterFactory:
assert check_argument_types()
if args.frontend_conf is not None and "fs" in args.frontend_conf:
dest_sample_rate = args.frontend_conf["fs"]
else:
dest_sample_rate = 16000
dataset = ESPnetDataset(
iter_options.data_path_and_name_and_type,
float_dtype=args.train_dtype,
preprocess=iter_options.preprocess_fn,
max_cache_size=iter_options.max_cache_size,
max_cache_fd=iter_options.max_cache_fd,
dest_sample_rate=args.frontend_conf["fs"],
dest_sample_rate=dest_sample_rate,
)
cls.check_task_requirements(
dataset, args.allow_variable_data_keys, train=iter_options.train

View File

@ -412,12 +412,6 @@ class ASRTask(AbsTask):
default="13_15",
help="The range of noise decibel level.",
)
parser.add_argument(
"--batch_interval",
type=int,
default=10000,
help="The batch interval for saving model.",
)
for class_choices in cls.class_choices_list:
# Append --<name> and --<name>_conf.

View File

@ -579,9 +579,10 @@ class Trainer:
reporter.measure_iter_time(iterator, "iter_time"), 1
):
assert isinstance(batch, dict), type(batch)
if rank == 0 and hasattr(model.module, "num_updates"):
num_batch_updates = model.module.get_num_updates()
if rank == 0:
if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")):
num_batch_updates = model.get_num_updates() if hasattr(model,"num_updates") else model.module.get_num_updates()
if (num_batch_updates%batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai:
buffer = BytesIO()
torch.save(model.state_dict(), buffer)
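
The extra hasattr checks are there because, under DistributedDataParallel, the user's model is wrapped and custom attributes such as num_updates live on model.module. A hypothetical helper expressing the same lookup (not part of the diff):

```python
def get_num_updates(model):
    # unwrap a DistributedDataParallel-style wrapper before reading the counter
    target = model.module if hasattr(model, "module") else model
    if hasattr(target, "num_updates"):
        return target.get_num_updates()
    return None
```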

View File

@ -45,8 +45,8 @@ def compute_wer(ref_file,
if out_item['wrong'] > 0:
rst['wrong_sentences'] += 1
cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
if rst['Wrd'] > 0:
rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)

View File

@ -1 +1 @@
0.3.2
0.3.3