From b730b382e4fd6628bd0face169dae3d03167b1d6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=98=89=E6=B8=8A?=
Date: Fri, 26 May 2023 10:09:30 +0800
Subject: [PATCH] update repo

---
Review notes (this text sits between "---" and the diff and is not part of
the commit message):
- The patch was re-flowed from a copy whose line breaks had been collapsed.
  Indentation of the three context lines is a best guess; apply with
  --ignore-whitespace if the hunk is rejected.
- The post-image hash on the "index" line was not recomputed after the
  review changes below.
- Stage 4 now waits on every training PID and aborts if one of them fails.
- `$[...]` arithmetic replaced with `$((...))`; redundant test before `rm -f`
  and `cat | wc -l` removed; duplicate world_size assignment removed; the
  file now ends with a newline.

 egs/wenetspeech/conformer/run.sh | 143 +++++++++++++++++++++++++++++++
 1 file changed, 143 insertions(+)

diff --git a/egs/wenetspeech/conformer/run.sh b/egs/wenetspeech/conformer/run.sh
index 9f6e4ae18..326440fee 100644
--- a/egs/wenetspeech/conformer/run.sh
+++ b/egs/wenetspeech/conformer/run.sh
@@ -99,3 +99,146 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
     echo "" >> ${token_list}
 fi
+
+# LM Training Stage
+# Placeholder: this stage only prints its banner.
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+# Starts ${gpu_num} background train.py processes, one per loop index, all
+# given the same file:// --dist_init_method path under the experiment dir.
+world_size=$gpu_num # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    # Remove any init file left by a previous run (rm -f is a no-op if absent).
+    rm -f "$INIT_FILE"
+    init_method=file://$(readlink -f "$INIT_FILE")
+    echo "$0: init method is $init_method"
+    train_pids=""
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            # Pick the (i+1)-th entry of the comma-separated CUDA_VISIBLE_DEVICES.
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --token_type $token_type \
+                --token_list $token_list \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --data_file_names "wav.scp,text" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        train_pids="${train_pids} $!"
+    done
+    # A bare `wait` returns 0 regardless of how the jobs ended, so wait on each
+    # PID individually to detect a failed training process.
+    train_failed=0
+    for pid in ${train_pids}; do
+        wait "${pid}" || train_failed=1
+    done
+    if [ ${train_failed} -ne 0 ]; then
+        echo "$0: ASR training failed, see ${exp_dir}/exp/${model_dir}/log/train.log.*" >&2
+        exit 1
+    fi
+fi
+
+# Testing Stage
+# For every test set: split the key file into ${_nj} shards, decode them as
+# parallel jobs, merge the per-job outputs and run utils/compute_wer.py.
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        # Never overwrite earlier results. Note that this ends the whole script,
+        # so later test sets and stage 6 are not run either.
+        if [ -d ${_dir} ]; then
+            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        # Use at most one job per line of the key file.
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        # Concatenate each job's output file in job order, then sort by key.
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+        python utils/proce_text.py ${_data}/text ${_data}/text.proc
+        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
+
+# Prepare files for ModelScope fine-tuning and inference
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "stage 6: ModelScope Preparation"
+    cp ${feats_dir}/data/${train_set}/cmvn/am.mvn ${exp_dir}/exp/${model_dir}/am.mvn
+    # Vocabulary size is the number of lines in the token list.
+    vocab_size=$(wc -l < "${token_list}")
+    # NOTE(review): --dataset is "aishell" although this recipe lives under
+    # egs/wenetspeech -- confirm that this is intended.
+    python utils/gen_modelscope_configuration.py \
+        --am_model_name $inference_asr_model \
+        --mode asr \
+        --model_name conformer \
+        --dataset aishell \
+        --output_dir $exp_dir/exp/$model_dir \
+        --vocab_size $vocab_size \
+        --tag $tag
+fi