From 5715decf4492ae97ee529339e0c59a8e3fdabfa8 Mon Sep 17 00:00:00 2001
From: zhifu gao
Date: Wed, 22 May 2024 10:46:49 +0800
Subject: [PATCH] Dev gzf deepspeed (#1745)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* resume from step
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* batch
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* train_loss_avg train_acc_avg
* log step
* wav does not exist
* wav does not exist
* decoding
* decoding
* decoding
* wechat
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* decoding key
* dynamic batch
* start_data_split_i=0
* total_time/accum_grad
* total_time/accum_grad
* total_time/accum_grad
* update avg slice
* update avg slice
* sensevoice sanm
* sensevoice sanm
* add
* add
* add
* add
* deepspeed
* update with main (#1731)

* c++ runtime adapt to 1.0 (#1724)

* adapt vad runtime to 1.0
* add json
* change yml name
* add func LoadVocabFromJson
* add token file for InitAsr
* add token path for OfflineStream
* add funcOpenYaml
* add token file for InitPunc
* add token file for stream
* update punc-model
* update funasr-wss-server
* update runtime_sdk_download_tool.py
* update docker list
* Delete docs/images/wechat.png
* Add files via upload

* Emo2Vec: restrict the selectable emotion categories (#1730)

* restrict the selectable emotion categories
* use none to disable emotion label output
* modify the output interface
* use unuse to disable the token

---------

Co-authored-by: 常材

* bugfix
* v1.0.27
* update docs
* hf hub
* Fix incorrect assignment of 'end' attribute to 'start' in sentences list comprehension (#1680)

---------

Co-authored-by: Yabin Li
Co-authored-by: gaochangfeng <54253717+gaochangfeng@users.noreply.github.com>
Co-authored-by: 常材
Co-authored-by: nsdou <168500039+nsdou@users.noreply.github.com>

* docs
* docs
* deepspeed
* deepspeed
* deepspeed
* deepspeed
* update
* ds
* ds
* ds
* ds
* ds
* ds
* ds
* add
* add
* bugfix
* add

---------

Co-authored-by: Yabin Li
Co-authored-by: gaochangfeng <54253717+gaochangfeng@users.noreply.github.com>
Co-authored-by: 常材
Co-authored-by: nsdou <168500039+nsdou@users.noreply.github.com>
---
 examples/wenetspeech/transformer/README.md          |  16 ++
 .../conf/transformer_12e_6d_2048_256.yaml           | 104 +++++++++
 .../wenetspeech/transformer/demo_infer.sh           |   1 +
 .../transformer/demo_train_or_finetune.sh           |   1 +
 .../transformer/local/aishell_data_prep.sh          |  66 ++++++
 .../transformer/local/download_and_untar.sh         | 105 +++++++++
 examples/wenetspeech/transformer/run.sh             | 203 ++++++++++++++++++
 examples/wenetspeech/transformer/utils              |   1 +
 8 files changed, 497 insertions(+)
 create mode 100644 examples/wenetspeech/transformer/README.md
 create mode 100644 examples/wenetspeech/transformer/conf/transformer_12e_6d_2048_256.yaml
 create mode 120000 examples/wenetspeech/transformer/demo_infer.sh
 create mode 120000 examples/wenetspeech/transformer/demo_train_or_finetune.sh
 create mode 100755 examples/wenetspeech/transformer/local/aishell_data_prep.sh
 create mode 100755 examples/wenetspeech/transformer/local/download_and_untar.sh
 create mode 100755 examples/wenetspeech/transformer/run.sh
 create mode 120000 examples/wenetspeech/transformer/utils

diff --git a/examples/wenetspeech/transformer/README.md b/examples/wenetspeech/transformer/README.md
new file mode 100644
index 000000000..2435b553b
--- /dev/null
+++ b/examples/wenetspeech/transformer/README.md
@@ -0,0 +1,16 @@
+
+# Transformer Result
+
+## Training Config
+- Feature info: 80-dim fbank, global CMVN, speed perturb (0.9, 1.0, 1.1), SpecAugment
+- Train info: lr 5e-4, batch_size 25000, 2 GPUs (Tesla V100), acc_grad 1, 50 epochs
+- Train config: conf/transformer_12e_6d_2048_256.yaml
+- LM config: LM was not used
+- Model size: 46M
+
+## Results (CER)
+
+| testset | CER(%) |
+|:-------:|:------:|
+|   dev   |  4.97  |
+|  test   |  5.37  |
\ No newline at end of file
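
For reference, the recipe added by this patch is driven end to end by run.sh (introduced further below). A minimal sketch of a typical invocation, assuming it is launched from examples/wenetspeech/transformer and that --stage/--stop_stage/--tag are picked up by utils/parse_options.sh as in that script:

    cd examples/wenetspeech/transformer
    # download AISHELL-1 (openslr resource 33), prepare data, train, and decode (stages -1 through 5)
    bash run.sh --stage -1 --stop_stage 5 --tag exp1
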
diff --git a/examples/wenetspeech/transformer/conf/transformer_12e_6d_2048_256.yaml b/examples/wenetspeech/transformer/conf/transformer_12e_6d_2048_256.yaml
new file mode 100644
index 000000000..efcf593a5
--- /dev/null
+++ b/examples/wenetspeech/transformer/conf/transformer_12e_6d_2048_256.yaml
@@ -0,0 +1,104 @@
+# This is an example that demonstrates how to configure a model file.
+# You can modify the configuration according to your own requirements.
+
+# to print the register_table:
+# from funasr.register import tables
+# tables.print()
+
+# network architecture
+model: Transformer
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# encoder
+encoder: TransformerEncoder
+encoder_conf:
+    output_size: 256    # dimension of attention
+    attention_heads: 4
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+
+# decoder
+decoder: TransformerDecoder
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+
+# frontend related
+frontend: WavFrontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+specaug: SpecAug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+train_conf:
+    accum_grad: 1
+    grad_clip: 5
+    max_epoch: 150
+    keep_nbest_models: 10
+    log_interval: 50
+
+optim: adam
+optim_conf:
+    lr: 0.002
+scheduler: warmuplr
+scheduler_conf:
+    warmup_steps: 30000
+
+dataset: AudioDataset
+dataset_conf:
+    index_ds: IndexDSJsonl
+    batch_sampler: EspnetStyleBatchSampler
+    batch_type: length # example or length
+    batch_size: 25000  # if batch_type is example, batch_size is the number of samples; if length, batch_size is source_token_len+target_token_len
+    max_token_length: 2048 # filter samples if source_token_len+target_token_len > max_token_length
+    buffer_size: 1024
+    shuffle: True
+    num_workers: 4
+    preprocessor_speech: SpeechPreprocessSpeedPerturb
+    preprocessor_speech_conf:
+        speed_perturb: [0.9, 1.0, 1.1]
+
+tokenizer: CharTokenizer
+tokenizer_conf:
+    unk_symbol: <unk>
+
+ctc_conf:
+    dropout_rate: 0.0
+    ctc_type: builtin
+    reduce: true
+    ignore_nan_grad: true
+normalize: null
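
The header comments of the config above note that the available model, encoder, decoder, frontend, and dataset choices are listed in FunASR's register table; the snippet they quote can be run directly from a shell (assuming a working FunASR installation):

    python -c "from funasr.register import tables; tables.print()"
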
diff --git a/examples/wenetspeech/transformer/demo_infer.sh b/examples/wenetspeech/transformer/demo_infer.sh
new file mode 120000
index 000000000..9d0a7a9e3
--- /dev/null
+++ b/examples/wenetspeech/transformer/demo_infer.sh
@@ -0,0 +1 @@
+../paraformer/demo_infer.sh
\ No newline at end of file
diff --git a/examples/wenetspeech/transformer/demo_train_or_finetune.sh b/examples/wenetspeech/transformer/demo_train_or_finetune.sh
new file mode 120000
index 000000000..bbabdbe84
--- /dev/null
+++ b/examples/wenetspeech/transformer/demo_train_or_finetune.sh
@@ -0,0 +1 @@
+../paraformer/demo_train_or_finetune.sh
\ No newline at end of file
diff --git a/examples/wenetspeech/transformer/local/aishell_data_prep.sh b/examples/wenetspeech/transformer/local/aishell_data_prep.sh
new file mode 100755
index 000000000..83f489b3c
--- /dev/null
+++ b/examples/wenetspeech/transformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
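
local/aishell_data_prep.sh can also be run on its own. A sketch of a standalone invocation with placeholder paths, following the usage message above, together with the Kaldi-style lists it leaves under the output directory:

    # arguments: <audio-path> <text-path> <output-path>
    local/aishell_data_prep.sh ../raw_data/data_aishell/wav \
                               ../raw_data/data_aishell/transcript \
                               ../DATA
    # each split gets two space-separated maps keyed by utterance id:
    #   ../DATA/data/{train,dev,test}/wav.scp  ->  <utt-id> <wav path>
    #   ../DATA/data/{train,dev,test}/text     ->  <utt-id> <transcript>
    head -n 1 ../DATA/data/train/wav.scp ../DATA/data/train/text
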
diff --git a/examples/wenetspeech/transformer/local/download_and_untar.sh b/examples/wenetspeech/transformer/local/download_and_untar.sh
new file mode 100755
index 000000000..d98255915
--- /dev/null
+++ b/examples/wenetspeech/transformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Johns Hopkins University (author: Daniel Povey)
+#           2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url. This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
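
Likewise, local/download_and_untar.sh can be used outside of run.sh. A sketch mirroring its usage message (the target directory must already exist, since the script checks for it; --remove-archive is optional):

    mkdir -p ../raw_data
    local/download_and_untar.sh --remove-archive ../raw_data www.openslr.org/resources/33 data_aishell
    local/download_and_untar.sh ../raw_data www.openslr.org/resources/33 resource_aishell
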
diff --git a/examples/wenetspeech/transformer/run.sh b/examples/wenetspeech/transformer/run.sh
new file mode 100755
index 000000000..3fb846519
--- /dev/null
+++ b/examples/wenetspeech/transformer/run.sh
@@ -0,0 +1,203 @@
+#!/usr/bin/env bash
+
+
+CUDA_VISIBLE_DEVICES="0,1"
+
+# general configuration
+feats_dir="../DATA"   # feature output directory
+exp_dir=`pwd`
+lang=zh
+token_type=char
+stage=0
+stop_stage=5
+
+# feature configuration
+nj=32
+
+inference_device="cuda" # "cpu"
+inference_checkpoint="model.pt.avg10"
+inference_scp="wav.scp"
+inference_batch_size=1
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+workspace=`pwd`
+
+master_port=12345
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on:
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+config=transformer_12e_6d_2048_256.yaml
+model_dir="baseline_$(basename "${config}" .yaml)_${lang}_${token_type}_${tag}"
+
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+  echo "stage -1: Data Download"
+  mkdir -p ${raw_data}
+  local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+  local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+  echo "stage 0: Data preparation"
+  # Data preparation
+  local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+  for x in train dev test; do
+    cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+    paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+      > ${feats_dir}/data/${x}/text
+    utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+    mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+
+    # convert wav.scp text to jsonl
+    scp_file_list_arg="++scp_file_list='[\"${feats_dir}/data/${x}/wav.scp\",\"${feats_dir}/data/${x}/text\"]'"
+    python ../../../funasr/datasets/audio_datasets/scp2jsonl.py \
+      ++data_type_list='["source", "target"]' \
+      ++jsonl_file_out=${feats_dir}/data/${x}/audio_datasets.jsonl \
+      ${scp_file_list_arg}
+  done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+  echo "stage 1: Feature and CMVN Generation"
+  python ../../../funasr/bin/compute_audio_cmvn.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++cmvn_file="${feats_dir}/data/${train_set}/cmvn.json"
+
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+  echo "stage 2: Dictionary Preparation"
+  mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+  echo "make a dictionary"
+  echo "<blank>" > ${token_list}
+  echo "<s>" >> ${token_list}
+  echo "</s>" >> ${token_list}
+  utils/text2token.py -s 1 -n 1 --space "<space>" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+    | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+  echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+  echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+  echo "stage 4: ASR Training"
+
+  mkdir -p ${exp_dir}/exp/${model_dir}
+  current_time=$(date "+%Y-%m-%d_%H-%M")
+  log_file="${exp_dir}/exp/${model_dir}/train.log.txt.${current_time}"
+  echo "log_file: ${log_file}"
+
+  export CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES
+  gpu_num=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+  torchrun \
+    --nnodes 1 \
+    --nproc_per_node ${gpu_num} \
+    --master_port ${master_port} \
+    ../../../funasr/bin/train.py \
+    --config-path "${workspace}/conf" \
+    --config-name "${config}" \
+    ++train_data_set_list="${feats_dir}/data/${train_set}/audio_datasets.jsonl" \
+    ++valid_data_set_list="${feats_dir}/data/${valid_set}/audio_datasets.jsonl" \
+    ++tokenizer_conf.token_list="${token_list}" \
+    ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \
+    ++output_dir="${exp_dir}/exp/${model_dir}" &> ${log_file}
+fi
+
+
+
Stage +if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then + echo "stage 5: Inference" + + if [ ${inference_device} == "cuda" ]; then + nj=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') + else + inference_batch_size=1 + CUDA_VISIBLE_DEVICES="" + for JOB in $(seq ${nj}); do + CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"-1," + done + fi + + for dset in ${test_sets}; do + + inference_dir="${exp_dir}/exp/${model_dir}/inference-${inference_checkpoint}/${dset}" + _logdir="${inference_dir}/logdir" + echo "inference_dir: ${inference_dir}" + + mkdir -p "${_logdir}" + data_dir="${feats_dir}/data/${dset}" + key_file=${data_dir}/${inference_scp} + + split_scps= + for JOB in $(seq "${nj}"); do + split_scps+=" ${_logdir}/keys.${JOB}.scp" + done + utils/split_scp.pl "${key_file}" ${split_scps} + + gpuid_list_array=(${CUDA_VISIBLE_DEVICES//,/ }) + for JOB in $(seq ${nj}); do + { + id=$((JOB-1)) + gpuid=${gpuid_list_array[$id]} + + export CUDA_VISIBLE_DEVICES=${gpuid} + python ../../../funasr/bin/inference.py \ + --config-path="${exp_dir}/exp/${model_dir}" \ + --config-name="config.yaml" \ + ++init_param="${exp_dir}/exp/${model_dir}/${inference_checkpoint}" \ + ++tokenizer_conf.token_list="${token_list}" \ + ++frontend_conf.cmvn_file="${feats_dir}/data/${train_set}/am.mvn" \ + ++input="${_logdir}/keys.${JOB}.scp" \ + ++output_dir="${inference_dir}/${JOB}" \ + ++device="${inference_device}" \ + ++ncpu=1 \ + ++disable_log=true \ + ++batch_size="${inference_batch_size}" &> ${_logdir}/log.${JOB}.txt + }& + + done + wait + + mkdir -p ${inference_dir}/1best_recog + for f in token score text; do + if [ -f "${inference_dir}/${JOB}/1best_recog/${f}" ]; then + for JOB in $(seq "${nj}"); do + cat "${inference_dir}/${JOB}/1best_recog/${f}" + done | sort -k1 >"${inference_dir}/1best_recog/${f}" + fi + done + + echo "Computing WER ..." + python utils/postprocess_text_zh.py ${inference_dir}/1best_recog/text ${inference_dir}/1best_recog/text.proc + python utils/postprocess_text_zh.py ${data_dir}/text ${inference_dir}/1best_recog/text.ref + python utils/compute_wer.py ${inference_dir}/1best_recog/text.ref ${inference_dir}/1best_recog/text.proc ${inference_dir}/1best_recog/text.cer + tail -n 3 ${inference_dir}/1best_recog/text.cer + done + +fi \ No newline at end of file diff --git a/examples/wenetspeech/transformer/utils b/examples/wenetspeech/transformer/utils new file mode 120000 index 000000000..1f2ce9d8f --- /dev/null +++ b/examples/wenetspeech/transformer/utils @@ -0,0 +1 @@ +../paraformer/utils \ No newline at end of file