mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
Merge branch 'main' into dev_cmz2
This commit is contained in:
commit 2e769fb36c
egs/aishell/transformer/utils/cmvn_converter.py  (new file, 53 lines)
@@ -0,0 +1,53 @@
import argparse
import json
import numpy as np


def get_parser():
    parser = argparse.ArgumentParser(
        description="cmvn converter",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--cmvn-json",
        "-c",
        default=False,
        required=True,
        type=str,
        help="cmvn json file",
    )
    parser.add_argument(
        "--am-mvn",
        "-a",
        default=False,
        required=True,
        type=str,
        help="am mvn file",
    )
    return parser

def main():
    parser = get_parser()
    args = parser.parse_args()

    with open(args.cmvn_json, "r") as fin:
        cmvn_dict = json.load(fin)

    mean_stats = np.array(cmvn_dict["mean_stats"])
    var_stats = np.array(cmvn_dict["var_stats"])
    total_frame = np.array(cmvn_dict["total_frames"])

    mean = -1.0 * mean_stats / total_frame
    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
    dims = mean.shape[0]
    with open(args.am_mvn, 'w') as fout:
        fout.write("<Nnet>" + "\n" + "<Splice> " + str(dims) + " " + str(dims) + '\n' + "[ 0 ]" + "\n" + "<AddShift> " + str(dims) + " " + str(dims) + "\n")
        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
        fout.write("</Nnet>" + '\n')

if __name__ == '__main__':
    main()

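The converter maps the accumulated statistics into the Kaldi-style nnet text format: the vector written after `<AddShift>` is the negative mean and the vector written after `<Rescale>` is the inverse standard deviation, so applying the exported am.mvn file amounts to per-dimension mean-variance normalization. A minimal sketch of that equivalence (the statistics and feature matrix below are hypothetical values, not data from the recipe):

```python
import numpy as np

# hypothetical accumulated statistics for 4-dimensional features
total_frames = 1000.0
mean_stats = np.array([500.0, -200.0, 0.0, 1000.0])   # sum of x over frames
var_stats = np.array([400.0, 300.0, 250.0, 1600.0])   # sum of x^2 over frames

shift = -1.0 * mean_stats / total_frames               # the <AddShift> vector
scale = 1.0 / np.sqrt(var_stats / total_frames - shift * shift)  # the <Rescale> vector

feats = np.random.randn(10, 4)                         # hypothetical (frames x dims) features
normalized = (feats + shift) * scale                   # what applying am.mvn does
```
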
@@ -45,8 +45,8 @@ def compute_wer(ref_file,
        if out_item['wrong'] > 0:
            rst['wrong_sentences'] += 1
        cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
-       cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
-       cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
+       cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
+       cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')

    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)

@@ -0,0 +1,6 @@
beam_size: 10
penalty: 0.0
maxlenratio: 0.0
minlenratio: 0.0
ctc_weight: 0.5
lm_weight: 0.7

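In ESPnet-style joint decoding, which the transformer recipes in this repository follow, these weights blend the attention decoder, CTC prefix, and language-model scores during beam search. A hedged sketch of that combination under that assumption (the exact behavior is defined by the decoding code, not by this config alone):

```python
# Hedged sketch: how the decode weights above are typically combined per hypothesis.
ctc_weight = 0.5
lm_weight = 0.7

def hypothesis_score(att_logp, ctc_logp, lm_logp):
    # weighted sum of attention-decoder, CTC-prefix and language-model log-probabilities
    return (1.0 - ctc_weight) * att_logp + ctc_weight * ctc_logp + lm_weight * lm_logp
```
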
egs/librispeech/conformer/conf/train_asr_conformer.yaml  (new file, 80 lines)
@@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8

log_interval: 50
normalize: None

@@ -0,0 +1,80 @@
encoder: conformer
encoder_conf:
    output_size: 512
    attention_heads: 8
    linear_units: 2048
    num_blocks: 12
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    attention_dropout_rate: 0.1
    input_layer: conv2d
    normalize_before: true
    macaron_style: true
    rel_pos_type: latest
    pos_enc_layer_type: rel_pos
    selfattention_layer_type: rel_selfattn
    activation_type: swish
    use_cnn_module: true
    cnn_module_kernel: 31

decoder: transformer
decoder_conf:
    attention_heads: 8
    linear_units: 2048
    num_blocks: 6
    dropout_rate: 0.1
    positional_dropout_rate: 0.1
    self_attention_dropout_rate: 0.1
    src_attention_dropout_rate: 0.1

model_conf:
    ctc_weight: 0.3
    lsm_weight: 0.1
    length_normalized_loss: false

accum_grad: 2
max_epoch: 50
patience: none
init: none
best_model_criterion:
-   - valid
    - acc
    - max
keep_nbest_models: 10

optim: adam
optim_conf:
    lr: 0.0025
    weight_decay: 0.000001
scheduler: warmuplr
scheduler_conf:
    warmup_steps: 40000

specaug: specaug
specaug_conf:
    apply_time_warp: true
    time_warp_window: 5
    time_warp_mode: bicubic
    apply_freq_mask: true
    freq_mask_width_range:
    - 0
    - 27
    num_freq_mask: 2
    apply_time_mask: true
    time_mask_width_ratio_range:
    - 0.
    - 0.05
    num_time_mask: 10

dataset_conf:
    shuffle: True
    shuffle_conf:
        shuffle_size: 1024
        sort_size: 500
    batch_conf:
        batch_type: token
        batch_size: 10000
    num_workers: 8

log_interval: 50
normalize: utterance_mvn

egs/librispeech/conformer/local/data_prep_librispeech.sh  (new executable file, 58 lines)
@@ -0,0 +1,58 @@
#!/usr/bin/env bash

# Copyright 2014  Vassil Panayotov
#           2014  Johns Hopkins University (author: Daniel Povey)
# Apache 2.0

if [ "$#" -ne 2 ]; then
  echo "Usage: $0 <src-dir> <dst-dir>"
  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
  exit 1
fi

src=$1
dst=$2

# all utterances are FLAC compressed
if ! which flac >&/dev/null; then
  echo "Please install 'flac' on ALL worker nodes!"
  exit 1
fi

spk_file=$src/../SPEAKERS.TXT

mkdir -p $dst || exit 1

[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1


wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
trans=$dst/text; [[ -f "$trans" ]] && rm $trans

for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
  reader=$(basename $reader_dir)
  if ! [ $reader -eq $reader ]; then  # not integer.
    echo "$0: unexpected subdirectory name $reader"
    exit 1
  fi

  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
    chapter=$(basename $chapter_dir)
    if ! [ "$chapter" -eq "$chapter" ]; then
      echo "$0: unexpected chapter-subdirectory name $chapter"
      exit 1
    fi

    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac \n", $0, dir, $0}' >>$wav_scp || exit 1

    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
    [ ! -f $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
    cat $chapter_trans >>$trans
  done
done

echo "$0: successfully prepared data in $dst"

exit 0

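The script writes a Kaldi-style `wav.scp` (one `utterance-id /path/to/utterance.flac` pair per line, produced by the awk command) and `text` (one `utterance-id TRANSCRIPT` pair per line, concatenated from the `*.trans.txt` files). A small hedged sketch of reading those files back; the paths below are hypothetical:

```python
def read_kaldi_map(path):
    """Parse 'key value...' lines such as wav.scp or text into a dict."""
    table = {}
    with open(path, "r", encoding="utf-8") as fin:
        for line in fin:
            parts = line.strip().split(maxsplit=1)
            if len(parts) == 2:
                table[parts[0]] = parts[1]
    return table

wav_scp = read_kaldi_map("data/dev-clean/wav.scp")  # utt-id -> flac path
text = read_kaldi_map("data/dev-clean/text")        # utt-id -> transcript
```
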
egs/librispeech/conformer/path.sh  (new executable file, 5 lines)
@@ -0,0 +1,5 @@
export FUNASR_DIR=$PWD/../../..

# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
export PYTHONIOENCODING=UTF-8
export PATH=$FUNASR_DIR/funasr/bin:$PATH

egs/librispeech/conformer/run.sh  (new executable file, 262 lines)
@@ -0,0 +1,262 @@
#!/usr/bin/env bash

. ./path.sh || exit 1;

# machines configuration
CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
gpu_num=8
count=1
gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
njob=5
train_cmd=utils/run.pl
infer_cmd=utils/run.pl

# general configuration
feats_dir="../DATA"  # feature output directory
exp_dir="."
lang=en
dumpdir=dump/fbank
feats_type=fbank
token_type=bpe
dataset_type=large
scp=feats.scp
type=kaldi_ark
stage=3
stop_stage=4

# feature configuration
feats_dim=80
sample_frequency=16000
nj=100
speed_perturb="0.9,1.0,1.1"

# data
data_librispeech=

# bpe model
nbpe=5000
bpemode=unigram

# exp tag
tag=""

. utils/parse_options.sh || exit 1;

# Set bash to 'debug' mode, it will exit on:
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
set -e
set -u
set -o pipefail

train_set=train_960
valid_set=dev
test_sets="test_clean test_other dev_clean dev_other"

asr_config=conf/train_asr_conformer.yaml
#asr_config=conf/train_asr_conformer_uttnorm.yaml
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
inference_asr_model=valid.acc.ave_10best.pth

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')

if ${gpu_inference}; then
    inference_nj=$[${ngpu}*${njob}]
    _ngpu=1
else
    inference_nj=$njob
    _ngpu=0
fi

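With the defaults above (8 visible GPUs and njob=5), GPU decoding therefore runs 8*5 = 40 parallel inference jobs, while CPU decoding would run only 5. The same arithmetic as a small sketch:

```python
# Sketch of the inference_nj computation above, using the script's default values.
ngpu = 8          # from CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
njob = 5
gpu_inference = True

inference_nj = ngpu * njob if gpu_inference else njob
print(inference_nj)  # 40 for GPU decoding, 5 for CPU decoding
```
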
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    echo "stage 0: Data preparation"
    # Data preparation
    for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do
        local/data_prep_librispeech.sh ${data_librispeech}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
    done
fi

feat_train_dir=${feats_dir}/${dumpdir}/$train_set; mkdir -p ${feat_train_dir}
feat_dev_clean_dir=${feats_dir}/${dumpdir}/dev_clean; mkdir -p ${feat_dev_clean_dir}
feat_dev_other_dir=${feats_dir}/${dumpdir}/dev_other; mkdir -p ${feat_dev_other_dir}
feat_test_clean_dir=${feats_dir}/${dumpdir}/test_clean; mkdir -p ${feat_test_clean_dir}
feat_test_other_dir=${feats_dir}/${dumpdir}/test_other; mkdir -p ${feat_test_other_dir}
feat_dev_dir=${feats_dir}/${dumpdir}/$valid_set; mkdir -p ${feat_dev_dir}
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    echo "stage 1: Feature Generation"
    # compute fbank features
    fbankdir=${feats_dir}/fbank
    for x in dev_clean dev_other test_clean test_other; do
        utils/compute_fbank.sh --cmd "$train_cmd" --nj 1 --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
            ${feats_dir}/data/${x} ${exp_dir}/exp/make_fbank/${x} ${fbankdir}/${x}
        utils/fix_data_feat.sh ${fbankdir}/${x}
    done

    mkdir ${feats_dir}/data/$train_set
    train_sets="train_clean_100 train_clean_360 train_other_500"
    for file in wav.scp text; do
        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
    done
    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --max_lengths 3000 --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
        ${feats_dir}/data/$train_set ${exp_dir}/exp/make_fbank/$train_set ${fbankdir}/$train_set
    utils/fix_data_feat.sh ${fbankdir}/$train_set

    # compute global cmvn
    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
        ${fbankdir}/$train_set ${exp_dir}/exp/make_fbank/$train_set

    # apply cmvn
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
        ${fbankdir}/$train_set ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/$train_set ${feat_train_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/dev_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_clean ${feat_dev_clean_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/dev_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/dev_other ${feat_dev_other_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/test_clean ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_clean ${feat_test_clean_dir}
    utils/apply_cmvn.sh --cmd "$train_cmd" --nj 1 \
        ${fbankdir}/test_other ${fbankdir}/$train_set/cmvn.json ${exp_dir}/exp/make_fbank/test_other ${feat_test_other_dir}

    cp ${fbankdir}/$train_set/text ${fbankdir}/$train_set/speech_shape ${fbankdir}/$train_set/text_shape ${feat_train_dir}
    cp ${fbankdir}/dev_clean/text ${fbankdir}/dev_clean/speech_shape ${fbankdir}/dev_clean/text_shape ${feat_dev_clean_dir}
    cp ${fbankdir}/dev_other/text ${fbankdir}/dev_other/speech_shape ${fbankdir}/dev_other/text_shape ${feat_dev_other_dir}
    cp ${fbankdir}/test_clean/text ${fbankdir}/test_clean/speech_shape ${fbankdir}/test_clean/text_shape ${feat_test_clean_dir}
    cp ${fbankdir}/test_other/text ${fbankdir}/test_other/speech_shape ${fbankdir}/test_other/text_shape ${feat_test_other_dir}

    dev_sets="dev_clean dev_other"
    for file in feats.scp text speech_shape text_shape; do
        ( for f in $dev_sets; do cat $feats_dir/${dumpdir}/$f/$file; done ) | sort -k1 > $feat_dev_dir/$file || exit 1;
    done

    # generate ark list
    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/${train_set} ${feat_train_dir}
    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/${valid_set} ${feat_dev_dir}
fi

dict=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
echo "dictionary: ${dict}"
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
    echo "stage 2: Dictionary and Json Data Preparation"
    mkdir -p ${feats_dir}/data/lang_char/
    echo "<blank>" > ${dict}
    echo "<s>" >> ${dict}
    echo "</s>" >> ${dict}
    cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
    spm_train --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
    spm_encode --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${dict}
    echo "<unk>" >> ${dict}
    wc -l ${dict}

    vocab_size=$(cat ${dict} | wc -l)
    awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
    awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$train_set
    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
    cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$train_set
    cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/$valid_set
fi


# Training Stage
world_size=$gpu_num  # run on one machine
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    echo "stage 3: Training"
    mkdir -p ${exp_dir}/exp/${model_dir}
    mkdir -p ${exp_dir}/exp/${model_dir}/log
    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
    if [ -f $INIT_FILE ];then
        rm -f $INIT_FILE
    fi
    init_method=file://$(readlink -f $INIT_FILE)
    echo "$0: init method is $init_method"
    for ((i = 0; i < $gpu_num; ++i)); do
        {
            rank=$i
            local_rank=$i
            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
            asr_train.py \
                --gpu_id $gpu_id \
                --use_preprocessor true \
                --split_with_space false \
                --bpemodel ${bpemodel}.model \
                --token_type $token_type \
                --dataset_type $dataset_type \
                --token_list $dict \
                --train_data_file $feats_dir/$dumpdir/${train_set}/ark_txt.scp \
                --valid_data_file $feats_dir/$dumpdir/${valid_set}/ark_txt.scp \
                --resume true \
                --output_dir ${exp_dir}/exp/${model_dir} \
                --config $asr_config \
                --input_size $feats_dim \
                --ngpu $gpu_num \
                --num_worker_count $count \
                --multiprocessing_distributed true \
                --dist_init_method $init_method \
                --dist_world_size $world_size \
                --dist_rank $rank \
                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
        } &
    done
    wait
fi

# Testing Stage
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    echo "stage 4: Inference"
    for dset in ${test_sets}; do
        asr_exp=${exp_dir}/exp/${model_dir}
        inference_tag="$(basename "${inference_config}" .yaml)"
        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
        _logdir="${_dir}/logdir"
        if [ -d ${_dir} ]; then
            echo "${_dir} already exists. If you want to decode again, please delete this dir first."
            exit 0
        fi
        mkdir -p "${_logdir}"
        _data="${feats_dir}/${dumpdir}/${dset}"
        key_file=${_data}/${scp}
        num_scp_file="$(<${key_file} wc -l)"
        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
        split_scps=
        for n in $(seq "${_nj}"); do
            split_scps+=" ${_logdir}/keys.${n}.scp"
        done
        # shellcheck disable=SC2086
        utils/split_scp.pl "${key_file}" ${split_scps}
        _opts=
        if [ -n "${inference_config}" ]; then
            _opts+="--config ${inference_config} "
        fi
        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
            python -m funasr.bin.asr_inference_launch \
                --batch_size 1 \
                --ngpu "${_ngpu}" \
                --njob ${njob} \
                --gpuid_list ${gpuid_list} \
                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
                --key_file "${_logdir}"/keys.JOB.scp \
                --asr_train_config "${asr_exp}"/config.yaml \
                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
                --output_dir "${_logdir}"/output.JOB \
                --mode asr \
                ${_opts}

        for f in token token_int score text; do
            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
                for i in $(seq "${_nj}"); do
                    cat "${_logdir}/output.${i}/1best_recog/${f}"
                done | sort -k1 >"${_dir}/${f}"
            fi
        done
        python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
        cat ${_dir}/text.cer.txt
    done
fi

egs/librispeech/conformer/utils  (new symbolic link)
@@ -0,0 +1 @@
../../aishell/transformer/utils

@@ -74,7 +74,7 @@ def modelscope_infer(params):
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(best_recog_path, "token")
+       text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

@@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -74,7 +74,7 @@ def modelscope_infer(params):
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(best_recog_path, "token")
+       text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

@@ -38,7 +38,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-   parser.add_argument('--audio_in', type=str, default="./data/test")
+   parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")

@@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
-   python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-   python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+   cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+   cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

@@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -17,7 +17,7 @@ def modelscope_infer(args):
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1")
-   parser.add_argument('--audio_in', type=str, default="./data/test")
+   parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")

@@ -63,8 +63,8 @@ fi
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
-   python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-   python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
+   cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+   cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

@@ -34,7 +34,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -75,7 +75,7 @@ def modelscope_infer(params):
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(best_recog_path, "token")
+       text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

@@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -75,7 +75,7 @@ def modelscope_infer(params):
    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(best_recog_path, "token")
+       text_proc_file = os.path.join(best_recog_path, "text")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))

@@ -39,7 +39,7 @@ def modelscope_infer_after_finetune(params):
    # compute CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
-       text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+       text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -797,7 +797,7 @@ def inference_modelscope(
                finish_count += 1
                # asr_utils.print_progress(finish_count / file_count)
                if writer is not None:
-                   ibest_writer["text"][key] = text_postprocessed
+                   ibest_writer["text"][key] = " ".join(word_lists)

            logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

@@ -42,6 +42,7 @@ from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+np.set_printoptions(threshold=np.inf)

class Speech2Text:
    """Speech2Text class

@@ -203,7 +204,6 @@ class Speech2Text:
        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)

@@ -213,13 +213,16 @@ class Speech2Text:
            feats = speech
            feats_len = speech_lengths
        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+       feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+       feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+       feats_len = torch.tensor([feats_len])
        batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}

        # a. To device
        batch = to_device(batch, device=self.device)

        # b. Forward Encoder
-       enc, enc_len = self.asr_model.encode_chunk(**batch)
+       enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
        if isinstance(enc, tuple):
            enc = enc[0]
        # assert len(enc) == 1, len(enc)

@@ -578,7 +581,22 @@ def inference_modelscope(
        speech2text = Speech2TextExport(**speech2text_kwargs)
    else:
        speech2text = Speech2Text(**speech2text_kwargs)

+   def _load_bytes(input):
+       middle_data = np.frombuffer(input, dtype=np.int16)
+       middle_data = np.asarray(middle_data)
+       if middle_data.dtype.kind not in 'iu':
+           raise TypeError("'middle_data' must be an array of integers")
+       dtype = np.dtype('float32')
+       if dtype.kind != 'f':
+           raise TypeError("'dtype' must be a floating point type")
+
+       i = np.iinfo(middle_data.dtype)
+       abs_max = 2 ** (i.bits - 1)
+       offset = i.min + abs_max
+       array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
+       return array
+
    def _forward(
        data_path_and_name_and_type,
        raw_inputs: Union[np.ndarray, torch.Tensor] = None,

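The `_load_bytes` helper added in this hunk converts a raw 16-bit PCM byte string into float32 samples scaled to roughly [-1, 1): it shifts by the integer range's midpoint offset and divides by 2^15. A usage sketch with synthetic audio bytes (the helper is nested inside `inference_modelscope` in the real file, so calling it standalone like this is only an illustration):

```python
import numpy as np

# synthetic int16 PCM: 100 samples of silence plus one full-scale positive peak
pcm = np.zeros(100, dtype=np.int16)
pcm[50] = 32767
raw_bytes = pcm.tobytes()

samples = _load_bytes(raw_bytes)   # as defined in the hunk above
print(samples.dtype, samples.min(), samples.max())
# float32 0.0 0.99996948  (i.e. 32767 / 32768)
```
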
@@ -589,10 +607,12 @@ def inference_modelscope(
    ):

        # 3. Build data-iterator
        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
            raw_inputs = _load_bytes(data_path_and_name_and_type[0])
            raw_inputs = torch.tensor(raw_inputs)
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, np.ndarray):
                raw_inputs = torch.tensor(raw_inputs)

        is_final = False
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]

@@ -605,62 +625,87 @@ def inference_modelscope(
        asr_result = ""
        wait = True
        if len(cache) == 0:
            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
            cache_de = {"decode_fsmn": None}
            cache["decoder"] = cache_de
            cache["first_chunk"] = True
            cache["speech"] = []
            cache["chunk_index"] = 0
            cache["speech_chunk"] = []
            cache["accum_speech"] = 0

        if raw_inputs is not None:
            if len(cache["speech"]) == 0:
                cache["speech"] = raw_inputs
            else:
                cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
            if len(cache["speech_chunk"]) == 0:
                cache["speech_chunk"] = raw_inputs
            else:
                cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
            while len(cache["speech_chunk"]) >= 960:
            cache["accum_speech"] += len(raw_inputs)
            while cache["accum_speech"] >= 960:
                if cache["first_chunk"]:
                    if len(cache["speech_chunk"]) >= 14400:
                        speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
                        speech_length = torch.tensor([14400])
                    if cache["accum_speech"] >= 14400:
                        speech = torch.unsqueeze(cache["speech"], axis=0)
                        speech_length = torch.tensor([len(cache["speech"])])
                        cache["encoder"]["pad_left"] = 5
                        cache["encoder"]["pad_right"] = 5
                        cache["encoder"]["stride"] = 10
                        cache["encoder"]["left"] = 5
                        cache["encoder"]["right"] = 0
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"] = cache["speech_chunk"][4800:]
                        cache["accum_speech"] -= 4800
                        cache["first_chunk"] = False
                        cache["encoder"]["start_idx"] = -5
                        cache["encoder"]["is_final"] = False
                        wait = False
                    else:
                        if is_final:
                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
                            cache["encoder"]["stride"] = len(cache["speech"]) // 960
                            cache["encoder"]["pad_left"] = 0
                            cache["encoder"]["pad_right"] = 0
                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
                            speech_length = torch.tensor([len(cache["speech_chunk"])])
                            speech = torch.unsqueeze(cache["speech"], axis=0)
                            speech_length = torch.tensor([len(cache["speech"])])
                            results = speech2text(cache, speech, speech_length)
                            cache["speech_chunk"] = []
                            cache["accum_speech"] = 0
                            wait = False
                        else:
                            break
                else:
                    if len(cache["speech_chunk"]) >= 19200:
                    if cache["accum_speech"] >= 19200:
                        cache["encoder"]["start_idx"] += 10
                        cache["encoder"]["stride"] = 10
                        cache["encoder"]["pad_left"] = 5
                        speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
                        speech_length = torch.tensor([19200])
                        cache["encoder"]["pad_right"] = 5
                        cache["encoder"]["left"] = 0
                        cache["encoder"]["right"] = 0
                        speech = torch.unsqueeze(cache["speech"], axis=0)
                        speech_length = torch.tensor([len(cache["speech"])])
                        results = speech2text(cache, speech, speech_length)
                        cache["speech_chunk"] = cache["speech_chunk"][9600:]
                        cache["accum_speech"] -= 9600
                        wait = False
                    else:
                        if is_final:
                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
                            cache["encoder"]["pad_right"] = 0
                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
                            speech_length = torch.tensor([len(cache["speech_chunk"])])
                            results = speech2text(cache, speech, speech_length)
                            cache["speech_chunk"] = []
                            wait = False
                            cache["encoder"]["is_final"] = True
                            if cache["accum_speech"] >= 14400:
                                cache["encoder"]["start_idx"] += 10
                                cache["encoder"]["stride"] = 10
                                cache["encoder"]["pad_left"] = 5
                                cache["encoder"]["pad_right"] = 5
                                cache["encoder"]["left"] = 0
                                cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
                                speech = torch.unsqueeze(cache["speech"], axis=0)
                                speech_length = torch.tensor([len(cache["speech"])])
                                results = speech2text(cache, speech, speech_length)
                                cache["accum_speech"] -= 9600
                                wait = False
                            else:
                                cache["encoder"]["start_idx"] += 10
                                cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
                                cache["encoder"]["pad_left"] = 5
                                cache["encoder"]["pad_right"] = 0
                                cache["encoder"]["left"] = 0
                                cache["encoder"]["right"] = 0
                                speech = torch.unsqueeze(cache["speech"], axis=0)
                                speech_length = torch.tensor([len(cache["speech"])])
                                results = speech2text(cache, speech, speech_length)
                                cache["accum_speech"] = 0
                                wait = False
                        else:
                            break

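The magic numbers in this streaming loop line up if one assumes 16 kHz input and 60 ms (960-sample) low-frame-rate encoder frames, which matches the 16 kHz models referenced elsewhere in this commit: 960 samples is one encoder frame, 9600 samples is the 10-frame stride consumed per step, and 14400/19200 samples are the 15- and 20-frame windows formed by the stride plus the 5-frame pads. A sketch of that bookkeeping (the frame size and sample rate are assumptions, not values read from a config in this diff):

```python
# Hedged sketch of the chunk sizes used above.
SAMPLE_RATE = 16000
SAMPLES_PER_FRAME = 960                  # assumed 60 ms per low-frame-rate encoder frame

stride_frames = 10                       # cache["encoder"]["stride"]
pad_frames = 5                           # pad_left / pad_right

stride_samples = stride_frames * SAMPLES_PER_FRAME                        # 9600, consumed per step
first_chunk = (stride_frames + pad_frames) * SAMPLES_PER_FRAME            # 14400, stride + right pad
steady_chunk = (pad_frames + stride_frames + pad_frames) * SAMPLES_PER_FRAME  # 19200, both pads
print(stride_samples, first_chunk, steady_chunk)                          # 9600 14400 19200
```
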
@@ -338,7 +338,7 @@ def inference_modelscope(
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["vad"][key] = "{}".format(vadsegments)
-               ibest_writer["text"][key] = text_postprocessed
+               ibest_writer["text"][key] = " ".join(word_lists)
                ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                if time_stamp_postprocessed is not None:
                    ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

@@ -670,7 +670,7 @@ def inference_modelscope(
                ibest_writer["token"][key] = " ".join(token)
                ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                ibest_writer["vad"][key] = "{}".format(vadsegments)
-               ibest_writer["text"][key] = text_postprocessed
+               ibest_writer["text"][key] = " ".join(word_lists)
                ibest_writer["text_with_punc"][key] = text_postprocessed_punc
                if time_stamp_postprocessed is not None:
                    ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)

@@ -738,13 +738,13 @@ def inference_modelscope(
                ibest_writer["rtf"][key] = rtf_cur

            if text is not None:
-               text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+               text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                item = {'key': key, 'value': text_postprocessed}
                asr_result_list.append(item)
                finish_count += 1
                # asr_utils.print_progress(finish_count / file_count)
                if writer is not None:
-                   ibest_writer["text"][key] = text_postprocessed
+                   ibest_writer["text"][key] = " ".join(word_lists)

            logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))

@@ -504,13 +504,13 @@ def inference_modelscope(
                ibest_writer["score"][key] = str(hyp.score)

            if text is not None:
-               text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+               text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                item = {'key': key, 'value': text_postprocessed}
                asr_result_list.append(item)
                finish_count += 1
                asr_utils.print_progress(finish_count / file_count)
                if writer is not None:
-                   ibest_writer["text"][key] = text_postprocessed
+                   ibest_writer["text"][key] = " ".join(word_lists)
        return asr_result_list

    return _forward

@@ -507,13 +507,13 @@ def inference_modelscope(
                ibest_writer["score"][key] = str(hyp.score)

            if text is not None:
-               text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
+               text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
                item = {'key': key, 'value': text_postprocessed}
                asr_result_list.append(item)
                finish_count += 1
                asr_utils.print_progress(finish_count / file_count)
                if writer is not None:
-                   ibest_writer["text"][key] = text_postprocessed
+                   ibest_writer["text"][key] = " ".join(word_lists)
        return asr_result_list

    return _forward

@@ -37,7 +37,7 @@ def tokenize(data,
            vad = -2

        if bpe_tokenizer is not None:
-           text = bpe_tokenizer.text2tokens(text)
+           text = bpe_tokenizer.text2tokens("".join(text))

        if seg_dict is not None:
            assert isinstance(seg_dict, dict)

@@ -19,6 +19,7 @@ class ModelExport:
        self,
        cache_dir: Union[Path, str] = None,
        onnx: bool = True,
+       device: str = "cpu",
        quant: bool = True,
        fallback_num: int = 0,
        audio_in: str = None,

@@ -36,6 +37,7 @@ class ModelExport:
        )
        print("output dir: {}".format(self.cache_dir))
        self.onnx = onnx
+       self.device = device
        self.quant = quant
        self.fallback_num = fallback_num
        self.frontend = None

@@ -112,6 +114,10 @@ class ModelExport:
        else:
            dummy_input = model.get_dummy_inputs()

+       if self.device == 'cuda':
+           model = model.cuda()
+           dummy_input = tuple([i.cuda() for i in dummy_input])
+
        # model_script = torch.jit.script(model)
        model_script = torch.jit.trace(model, dummy_input)
        model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))

@@ -260,6 +266,7 @@ if __name__ == '__main__':
    parser.add_argument('--model-name', type=str, required=True)
    parser.add_argument('--export-dir', type=str, required=True)
    parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
+   parser.add_argument('--device', type=str, default='cpu', help='["cpu", "cuda"]')
    parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
    parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
    parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')

@@ -269,6 +276,7 @@ if __name__ == '__main__':
    export_model = ModelExport(
        cache_dir=args.export_dir,
        onnx=args.type == 'onnx',
+       device=args.device,
        quant=args.quantize,
        fallback_num=args.fallback_num,
        audio_in=args.audio_in,

@@ -75,8 +75,8 @@ def preprocess_for_attn(x, mask, cache, pad_fn):
    return x, cache


-torch_version = float(".".join(torch.__version__.split(".")[:2]))
-if torch_version >= 1.8:
+torch_version = tuple([int(i) for i in torch.__version__.split(".")[:2]])
+if torch_version >= (1, 8):
    import torch.fx
    torch.fx.wrap('preprocess_for_attn')

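The replaced float comparison silently breaks on PyTorch 1.10 and later, because `float("1.10")` is 1.1 and compares as older than 1.8; comparing the version components as an integer tuple avoids that. A small illustration (the version strings are just examples):

```python
# Why the tuple comparison is needed: the old float-based check misorders 1.10+.
for version in ["1.8.2", "1.10.0", "2.1.0"]:
    as_float = float(".".join(version.split(".")[:2]))
    as_tuple = tuple(int(i) for i in version.split(".")[:2])
    print(version, as_float >= 1.8, as_tuple >= (1, 8))
# 1.8.2  True  True
# 1.10.0 False True   <- float("1.10") == 1.1, so the old check wrongly fails
# 2.1.0  True  True
```
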
@@ -74,7 +74,7 @@ class ContextualDecoderLayer(nn.Module):
        return x, tgt_mask, x_self_attn, x_src_attn


-class ContexutalBiasDecoder(nn.Module):
+class ContextualBiasDecoder(nn.Module):
    def __init__(
        self,
        size,

@@ -83,7 +83,7 @@ class ContexutalBiasDecoder(nn.Module):
        normalize_before=True,
    ):
        """Construct an DecoderLayer object."""
-       super(ContexutalBiasDecoder, self).__init__()
+       super(ContextualBiasDecoder, self).__init__()
        self.size = size
        self.src_attn = src_attn
        if src_attn is not None:

@@ -186,7 +186,7 @@ class ContextualParaformerDecoder(ParaformerSANMDecoder):
            ),
        )
        self.dropout = nn.Dropout(dropout_rate)
-       self.bias_decoder = ContexutalBiasDecoder(
+       self.bias_decoder = ContextualBiasDecoder(
            size=attention_dim,
            src_attn=MultiHeadedAttentionCrossAtt(
                attention_heads, attention_dim, src_attention_dropout_rate

@@ -104,7 +104,6 @@ class DecoderLayerSANM(nn.Module):
        x = residual + self.dropout(self.src_attn(x, memory, memory_mask))


        return x, tgt_mask, memory, memory_mask, cache

    def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):

@@ -400,7 +399,7 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            c = cache[i]
-           x, tgt_mask, memory, memory_mask, c_ret = decoder(
+           x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, memory_mask, cache=c
            )
            new_cache.append(c_ret)

@@ -410,13 +409,13 @@ class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
            j = i + self.att_layer_num
            decoder = self.decoders2[i]
            c = cache[j]
-           x, tgt_mask, memory, memory_mask, c_ret = decoder(
+           x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, memory_mask, cache=c
            )
            new_cache.append(c_ret)

        for decoder in self.decoders3:
-           x, tgt_mask, memory, memory_mask, _ = decoder(
+           x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=None
            )

@@ -1077,7 +1076,7 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            c = cache[i]
-           x, tgt_mask, memory, memory_mask, c_ret = decoder(
+           x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=c
            )
            new_cache.append(c_ret)

@@ -1087,14 +1086,14 @@ class ParaformerSANMDecoder(BaseTransformerDecoder):
            j = i + self.att_layer_num
            decoder = self.decoders2[i]
            c = cache[j]
-           x, tgt_mask, memory, memory_mask, c_ret = decoder(
+           x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=c
            )
            new_cache.append(c_ret)

        for decoder in self.decoders3:

-           x, tgt_mask, memory, memory_mask, _ = decoder(
+           x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, tgt_mask, memory, None, cache=None
            )

@@ -370,19 +370,10 @@ class Paraformer(AbsESPnetModel):
            encoder_out, encoder_out_lens
        )

-       assert encoder_out.size(0) == speech.size(0), (
-           encoder_out.size(),
-           speech.size(0),
-       )
-       assert encoder_out.size(1) <= encoder_out_lens.max(), (
-           encoder_out.size(),
-           encoder_out_lens.max(),
-       )

        if intermediate_outs is not None:
            return (encoder_out, intermediate_outs), encoder_out_lens

-       return encoder_out, encoder_out_lens
+       return encoder_out, torch.tensor([encoder_out.size(1)])

    def calc_predictor(self, encoder_out, encoder_out_lens):

@@ -1034,16 +1025,76 @@ class BiCifParaformer(Paraformer):

        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        intermediate_outs = None
        if isinstance(encoder_out, tuple):
            intermediate_outs = encoder_out[1]
            encoder_out = encoder_out[0]

        loss_att, acc_att, cer_att, wer_att = None, None, None, None
        loss_ctc, cer_ctc = None, None
        loss_pre = None
        stats = dict()

        # 1. CTC branch
        if self.ctc_weight != 0.0:
            loss_ctc, cer_ctc = self._calc_ctc_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )

            # Collect CTC branch stats
            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
            stats["cer_ctc"] = cer_ctc

        # Intermediate CTC (optional)
        loss_interctc = 0.0
        if self.interctc_weight != 0.0 and intermediate_outs is not None:
            for layer_idx, intermediate_out in intermediate_outs:
                # we assume intermediate_out has the same length & padding
                # as those of encoder_out
                loss_ic, cer_ic = self._calc_ctc_loss(
                    intermediate_out, encoder_out_lens, text, text_lengths
                )
                loss_interctc = loss_interctc + loss_ic

                # Collect Intermediate CTC stats
                stats["loss_interctc_layer{}".format(layer_idx)] = (
                    loss_ic.detach() if loss_ic is not None else None
                )
                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic

            loss_interctc = loss_interctc / len(intermediate_outs)

            # calculate whole encoder loss
            loss_ctc = (
                1 - self.interctc_weight
            ) * loss_ctc + self.interctc_weight * loss_interctc

        # 2b. Attention decoder branch
        if self.ctc_weight != 1.0:
            loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
                encoder_out, encoder_out_lens, text, text_lengths
            )

        loss_pre2 = self._calc_pre2_loss(
            encoder_out, encoder_out_lens, text, text_lengths
        )

        loss = loss_pre2
        # 3. CTC-Att loss definition
        if self.ctc_weight == 0.0:
            loss = loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
        elif self.ctc_weight == 1.0:
            loss = loss_ctc
        else:
            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5

        # Collect Attn branch stats
        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
        stats["acc"] = acc_att
        stats["cer"] = cer_att
        stats["wer"] = wer_att
        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
        stats["loss_pre2"] = loss_pre2.detach().cpu()

        stats["loss"] = torch.clone(loss.detach())

        # force_gatherable: to-device and to-tensor if scalar for DataParallel

@@ -1094,6 +1145,7 @@ class ContextualParaformer(Paraformer):
        inner_dim: int = 256,
        bias_encoder_type: str = 'lstm',
        label_bracket: bool = False,
+       use_decoder_embedding: bool = False,
    ):
        assert check_argument_types()
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight

@@ -1147,6 +1199,7 @@ class ContextualParaformer(Paraformer):
        self.hotword_buffer = None
        self.length_record = []
        self.current_buffer_length = 0
+       self.use_decoder_embedding = use_decoder_embedding

    def forward(
        self,

@@ -1288,7 +1341,10 @@ class ContextualParaformer(Paraformer):
            hw_list.append(hw_tokens)
        # padding
        hw_list_pad = pad_list(hw_list, 0)
-       hw_embed = self.decoder.embed(hw_list_pad)
+       if self.use_decoder_embedding:
+           hw_embed = self.decoder.embed(hw_list_pad)
+       else:
+           hw_embed = self.bias_embed(hw_list_pad)
        hw_embed, (_, _) = self.bias_encoder(hw_embed)
        _ind = np.arange(0, len(hw_list)).tolist()
        # update self.hotword_buffer, throw a part if oversize

@@ -1404,13 +1460,19 @@ class ContextualParaformer(Paraformer):
            # default hotword list
            hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)]  # empty hotword list
            hw_list_pad = pad_list(hw_list, 0)
-           hw_embed = self.bias_embed(hw_list_pad)
+           if self.use_decoder_embedding:
+               hw_embed = self.decoder.embed(hw_list_pad)
+           else:
+               hw_embed = self.bias_embed(hw_list_pad)
            _, (h_n, _) = self.bias_encoder(hw_embed)
            contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
        else:
            hw_lengths = [len(i) for i in hw_list]
            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
-           hw_embed = self.bias_embed(hw_list_pad)
+           if self.use_decoder_embedding:
+               hw_embed = self.decoder.embed(hw_list_pad)
+           else:
+               hw_embed = self.bias_embed(hw_list_pad)
            hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
                                                               enforce_sorted=False)
            _, (h_n, _) = self.bias_encoder(hw_embed)

@@ -200,6 +200,7 @@ class CifPredictorV2(nn.Module):
        return acoustic_embeds, token_num, alphas, cif_peak

    def forward_chunk(self, hidden, cache=None):
        b, t, d = hidden.size()
        h = hidden
        context = h.transpose(1, 2)
        queries = self.pad(context)

@@ -220,6 +221,8 @@ class CifPredictorV2(nn.Module):
            alphas = alphas * mask_chunk_predictor

        if cache is not None:
            if cache["is_final"]:
                alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
            if cache["cif_hidden"] is not None:
                hidden = torch.cat((cache["cif_hidden"], hidden), 1)
            if cache["cif_alphas"] is not None:

@@ -241,7 +244,6 @@ class CifPredictorV2(nn.Module):
            mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
            mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0


        if mask_chunk_peak_predictor is not None:
            cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)

@@ -8,7 +8,7 @@
import math
import torch

+import torch.nn.functional as F

def _pre_hook(
    state_dict,

@@ -409,9 +409,18 @@ class SinusoidalPositionEncoder(torch.nn.Module):

    def forward_chunk(self, x, cache=None):
        start_idx = 0
+       pad_left = 0
+       pad_right = 0
        batch_size, timesteps, input_dim = x.size()
        if cache is not None:
            start_idx = cache["start_idx"]
+           pad_left = cache["left"]
+           pad_right = cache["right"]
        positions = torch.arange(1, timesteps+start_idx+1)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
-       return x + position_encoding[:, start_idx: start_idx + timesteps]
+       outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
+       outputs = outputs.transpose(1,2)
+       outputs = F.pad(outputs, (pad_left, pad_right))
+       outputs = outputs.transpose(1,2)
+       return outputs

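The new `forward_chunk` pads the positionally encoded chunk along the time axis: transposing to (batch, dim, time) lets `F.pad` with a 2-tuple pad the last (time) dimension by `pad_left`/`pad_right` frames of zeros before transposing back. A standalone sketch of just that padding step, with made-up shapes:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 10, 512)      # (batch, time, dim), hypothetical chunk of 10 frames
pad_left, pad_right = 5, 0

out = x.transpose(1, 2)          # (batch, dim, time); F.pad pads the last dim
out = F.pad(out, (pad_left, pad_right))
out = out.transpose(1, 2)        # back to (batch, time, dim)
print(out.shape)                 # torch.Size([1, 15, 512])
```
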
@@ -53,6 +53,68 @@ cd ../python/grpc
python grpc_main_client_mic.py --host $server_ip --port 10108
```

The `grpc_main_client_mic.py` follows the [original design](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc#workflow-in-desgin) by sending audio_data in chunks. If you want to send audio_data in one request, here is an example:

```
# go to ../python/grpc to find this package
import paraformer_pb2


class RecognizeStub:
    def __init__(self, channel):
        self.Recognize = channel.stream_stream(
            '/paraformer.ASR/Recognize',
            request_serializer=paraformer_pb2.Request.SerializeToString,
            response_deserializer=paraformer_pb2.Response.FromString,
        )


async def send(channel, data, speaking, isEnd):
    stub = RecognizeStub(channel)
    req = paraformer_pb2.Request()
    if data:
        req.audio_data = data
    req.user = 'zz'
    req.language = 'zh-CN'
    req.speaking = speaking
    req.isEnd = isEnd
    q = queue.SimpleQueue()
    q.put(req)
    return stub.Recognize(iter(q.get, None))


# send the audio data once
async def grpc_rec(data, grpc_uri):
    with grpc.insecure_channel(grpc_uri) as channel:
        b = time.time()
        response = await send(channel, data, False, False)
        resp = response.next()
        text = ''
        if 'decoding' == resp.action:
            resp = response.next()
        if 'finish' == resp.action:
            text = json.loads(resp.sentence)['text']
        response = await send(channel, None, False, True)
        return {
            'text': text,
            'time': time.time() - b,
        }


async def test():
    # fc = FunAsrGrpcClient('127.0.0.1', 9900)
    # t = await fc.rec(wav.tobytes())
    # print(t)
    wav, _ = sf.read('z-10s.wav', dtype='int16')
    uri = '127.0.0.1:9900'
    res = await grpc_rec(wav.tobytes(), uri)
    print(res)


if __name__ == '__main__':
    asyncio.run(test())

```

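As written, the README snippet above relies on several modules it never imports; under the reasonable assumption that it is meant to run standalone, it would additionally need roughly the following (grpc, soundfile, and the standard-library helpers it calls):

```python
# Imports the snippet above appears to assume; this list is inferred from the
# calls it makes and is not part of the committed example itself.
import asyncio
import json
import queue
import time

import grpc
import soundfile as sf
```
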

## Acknowledge
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [DeepScience](https://www.deepscience.cn) for contributing the grpc service.

@@ -88,7 +88,7 @@ grpc::Status ASRServicer::Recognize(
        res.set_language(req.language());
        stream->Write(res);
    } else if (!req.speaking()) {
-       if (client_buffers.count(req.user()) == 0) {
+       if (client_buffers.count(req.user()) == 0 && req.audio_data().size() == 0) {
            Response res;
            res.set_sentence(
                R"({"success": true, "detail": "waiting_for_voice"})"

@@ -99,14 +99,18 @@ grpc::Status ASRServicer::Recognize(
            stream->Write(res);
        }else {
            auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
            if (req.audio_data().size() > 0) {
                auto& buf = client_buffers[req.user()];
                buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
            }
            std::string tmp_data = this->client_buffers[req.user()];
            this->clear_states(req.user());


            Response res;
            res.set_sentence(
                R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
            );
            int data_len_int = tmp_data.length();
            int data_len_int = tmp_data.length();
            std::string data_len = std::to_string(data_len_int);
            std::stringstream ss;
            ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")" << R"("})";

@ -129,18 +133,18 @@ grpc::Status ASRServicer::Recognize(
|
||||
res.set_user(req.user());
|
||||
res.set_action("finish");
|
||||
res.set_language(req.language());
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
stream->Write(res);
|
||||
}
|
||||
else {
|
||||
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
|
||||
RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
|
||||
std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
|
||||
|
||||
auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
|
||||
std::string delay_str = std::to_string(end_time - begin_time);
|
||||
|
||||
|
||||
std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
|
||||
Response res;
|
||||
std::stringstream ss;
|
||||
@ -150,8 +154,8 @@ grpc::Status ASRServicer::Recognize(
|
||||
res.set_user(req.user());
|
||||
res.set_action("finish");
|
||||
res.set_language(req.language());
|
||||
|
||||
|
||||
|
||||
|
||||
stream->Write(res);
|
||||
}
|
||||
}
|
||||
@ -165,7 +169,7 @@ grpc::Status ASRServicer::Recognize(
|
||||
res.set_language(req.language());
|
||||
stream->Write(res);
|
||||
}
|
||||
}
|
||||
}
|
||||
return Status::OK;
|
||||
}

@ -109,7 +109,7 @@ class ASRServicer(paraformer_pb2_grpc.ASRServicer):
                    else:
                        asr_result = ""
                elif self.backend == "onnxruntime":
                    from rapid_paraformer.utils.frontend import load_bytes
                    from funasr_onnx.utils.frontend import load_bytes
                    array = load_bytes(tmp_data)
                    asr_result = self.inference_16k_pipeline(array)[0]
                end_time = int(round(time.time() * 1000))

@ -31,7 +31,7 @@

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_torch
cd funasr/runtime/python/libtorch
python setup.py build
python setup.py install
```

@ -1,10 +1,15 @@

from funasr_torch import Paraformer

model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model = Paraformer(model_dir, batch_size=1)

wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

model = Paraformer(model_dir, batch_size=1) # cpu
# model = Paraformer(model_dir, batch_size=1, device_id=0) # gpu

# when using paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get figure of alignment besides timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

wav_path = "YourPath/xx.wav"

result = model(wav_path)
print(result)
print(result)

@ -46,6 +46,7 @@ class Paraformer():
        )
        self.ort_infer = torch.jit.load(model_file)
        self.batch_size = batch_size
        self.device_id = device_id
        self.plot_timestamp_to = plot_timestamp_to
        self.pred_bias = pred_bias

@ -58,8 +59,13 @@ class Paraformer():
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                outputs = self.ort_infer(feats, feats_len)
                am_scores, valid_token_lens = outputs[0], outputs[1]
                with torch.no_grad():
                    if int(self.device_id) == -1:
                        outputs = self.ort_infer(feats, feats_len)
                        am_scores, valid_token_lens = outputs[0], outputs[1]
                    else:
                        outputs = self.ort_infer(feats.cuda(), feats_len.cuda())
                        am_scores, valid_token_lens = outputs[0].cpu(), outputs[1].cpu()
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_peaks = outputs[2], outputs[3]

@ -32,7 +32,7 @@ or install from source code

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_onnx
cd funasr/runtime/python/onnxruntime
python setup.py build
python setup.py install
```

@ -1,13 +1,15 @@

from funasr_onnx import Paraformer

model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"

# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
# plot_timestamp_to works only when using speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0)
model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"

wav_path = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/example/asr_example.wav"
model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0) # cpu
# model = Paraformer(model_dir, batch_size=2, plot_timestamp_to="./", pred_bias=0, device_id=0) # gpu

# when using paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get figure of alignment besides timestamps
# model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png")

wav_path = "YourPath/xx.wav"

result = model(wav_path)
print(result)
print(result)

@ -464,6 +464,12 @@ class AbsTask(ABC):
            default=sys.maxsize,
            help="The maximum number update step to train",
        )
        parser.add_argument(
            "--batch_interval",
            type=int,
            default=10000,
            help="The batch interval for saving model.",
        )
        group.add_argument(
            "--patience",
            type=int_or_none,
@ -1355,15 +1361,15 @@ class AbsTask(ABC):
            from funasr.datasets.large_datasets.build_dataloader import ArkDataLoader
            train_iter_factory = ArkDataLoader(args.train_data_file, args.token_list, args.dataset_conf,
                                               frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
                                               seg_dict_file=args.seg_dict_file if hasattr(args,
                                                                                           "seg_dict_file") else None,
                                               seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                               punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
                                               bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                               mode="train")
            valid_iter_factory = ArkDataLoader(args.valid_data_file, args.token_list, args.dataset_conf,
                                               frontend_conf=args.frontend_conf if hasattr(args, "frontend_conf") else None,
                                               seg_dict_file=args.seg_dict_file if hasattr(args,
                                                                                           "seg_dict_file") else None,
                                               seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                                               punc_dict_file=args.punc_list if hasattr(args, "punc_list") else None,
                                               bpemodel_file=args.bpemodel if hasattr(args, "bpemodel") else None,
                                               mode="eval")
        elif args.dataset_type == "small":
            train_iter_factory = cls.build_iter_factory(
@ -1576,13 +1582,18 @@ class AbsTask(ABC):
    ) -> AbsIterFactory:
        assert check_argument_types()

        if args.frontend_conf is not None and "fs" in args.frontend_conf:
            dest_sample_rate = args.frontend_conf["fs"]
        else:
            dest_sample_rate = 16000

        dataset = ESPnetDataset(
            iter_options.data_path_and_name_and_type,
            float_dtype=args.train_dtype,
            preprocess=iter_options.preprocess_fn,
            max_cache_size=iter_options.max_cache_size,
            max_cache_fd=iter_options.max_cache_fd,
            dest_sample_rate=args.frontend_conf["fs"],
            dest_sample_rate=dest_sample_rate,
        )
        cls.check_task_requirements(
            dataset, args.allow_variable_data_keys, train=iter_options.train

@ -412,12 +412,6 @@ class ASRTask(AbsTask):
            default="13_15",
            help="The range of noise decibel level.",
        )
        parser.add_argument(
            "--batch_interval",
            type=int,
            default=10000,
            help="The batch interval for saving model.",
        )

        for class_choices in cls.class_choices_list:
            # Append --<name> and --<name>_conf.

@ -579,9 +579,10 @@ class Trainer:
            reporter.measure_iter_time(iterator, "iter_time"), 1
        ):
            assert isinstance(batch, dict), type(batch)

            if rank == 0 and hasattr(model.module, "num_updates"):
                num_batch_updates = model.module.get_num_updates()

            if rank == 0:
                if hasattr(model, "num_updates") or (hasattr(model, "module") and hasattr(model.module, "num_updates")):
                    num_batch_updates = model.get_num_updates() if hasattr(model,"num_updates") else model.module.get_num_updates()
                    if (num_batch_updates%batch_interval == 0) and (options.oss_bucket is not None) and options.use_pai:
                        buffer = BytesIO()
                        torch.save(model.state_dict(), buffer)
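
A small, self-contained illustration of the dispatch the Trainer hunk above introduces, which works whether the model is bare or wrapped DistributedDataParallel-style behind `.module`; the toy classes are made up for the example:

```python
import torch


class ToyModel(torch.nn.Module):
    """Stand-in for a model that tracks its own update counter."""

    def __init__(self):
        super().__init__()
        self.num_updates = 0

    def get_num_updates(self):
        return self.num_updates


class Wrapper(torch.nn.Module):
    """Mimics DistributedDataParallel, which exposes the real model as `.module`."""

    def __init__(self, module):
        super().__init__()
        self.module = module


def get_update_count(model):
    # Same dispatch as the hunk above: use the bare model if it tracks updates,
    # otherwise fall back to the wrapped `.module`.
    if hasattr(model, "num_updates"):
        return model.get_num_updates()
    if hasattr(model, "module") and hasattr(model.module, "num_updates"):
        return model.module.get_num_updates()
    return None


bare = ToyModel()
print(get_update_count(bare))           # 0
print(get_update_count(Wrapper(bare)))  # 0, resolved through .module
```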

@ -45,8 +45,8 @@ def compute_wer(ref_file,
            if out_item['wrong'] > 0:
                rst['wrong_sentences'] += 1
                cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
                cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
                cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
                cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
                cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')

    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
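
A tiny illustration of what the rewritten ref/hyp lines above write out: tokens are now lowercased and space-joined instead of concatenated; the sample tokens are made up:

```python
ref_tokens = ["HELLO", "WORLD"]  # made-up tokens standing in for ref_dict[hyp_key]

old_style = "".join(ref_tokens)                                   # 'HELLOWORLD'
new_style = " ".join(list(map(lambda x: x.lower(), ref_tokens)))  # 'hello world'
print(old_style, "->", new_style)
```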

@ -1 +1 @@
0.3.2
0.3.3