From fc54f4c6acd95e72173fd47909dd114ed30f4801 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=98=89=E6=B8=8A?= Date: Tue, 25 Apr 2023 17:44:14 +0800 Subject: [PATCH] update --- .../paraformer/local/download_and_untar.sh | 105 ++++++++++++++++++ egs/aishell/paraformer/local/prepare_data.sh | 53 --------- egs/aishell/paraformer/run.sh | 62 +++-------- egs/aishell/transformer/utils/compute_cmvn.sh | 22 ++-- 4 files changed, 132 insertions(+), 110 deletions(-) create mode 100755 egs/aishell/paraformer/local/download_and_untar.sh delete mode 100755 egs/aishell/paraformer/local/prepare_data.sh diff --git a/egs/aishell/paraformer/local/download_and_untar.sh b/egs/aishell/paraformer/local/download_and_untar.sh new file mode 100755 index 000000000..d98255915 --- /dev/null +++ b/egs/aishell/paraformer/local/download_and_untar.sh @@ -0,0 +1,105 @@ +#!/usr/bin/env bash + +# Copyright 2014 Johns Hopkins University (author: Daniel Povey) +# 2017 Xingyu Na +# Apache 2.0 + +remove_archive=false + +if [ "$1" == --remove-archive ]; then + remove_archive=true + shift +fi + +if [ $# -ne 3 ]; then + echo "Usage: $0 [--remove-archive] " + echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell" + echo "With --remove-archive it will remove the archive after successfully un-tarring it." + echo " can be one of: data_aishell, resource_aishell." +fi + +data=$1 +url=$2 +part=$3 + +if [ ! -d "$data" ]; then + echo "$0: no such directory $data" + exit 1; +fi + +part_ok=false +list="data_aishell resource_aishell" +for x in $list; do + if [ "$part" == $x ]; then part_ok=true; fi +done +if ! $part_ok; then + echo "$0: expected to be one of $list, but got '$part'" + exit 1; +fi + +if [ -z "$url" ]; then + echo "$0: empty URL base." + exit 1; +fi + +if [ -f $data/$part/.complete ]; then + echo "$0: data part $part was already successfully extracted, nothing to do." + exit 0; +fi + +# sizes of the archive files in bytes. +sizes="15582913665 1246920" + +if [ -f $data/$part.tgz ]; then + size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}') + size_ok=false + for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done + if ! $size_ok; then + echo "$0: removing existing file $data/$part.tgz because its size in bytes $size" + echo "does not equal the size of one of the archives." + rm $data/$part.tgz + else + echo "$data/$part.tgz exists and appears to be complete." + fi +fi + +if [ ! -f $data/$part.tgz ]; then + if ! command -v wget >/dev/null; then + echo "$0: wget is not installed." + exit 1; + fi + full_url=$url/$part.tgz + echo "$0: downloading data from $full_url. This may take some time, please be patient." + + cd $data || exit 1 + if ! wget --no-check-certificate $full_url; then + echo "$0: error executing wget $full_url" + exit 1; + fi +fi + +cd $data || exit 1 + +if ! tar -xvzf $part.tgz; then + echo "$0: error un-tarring archive $data/$part.tgz" + exit 1; +fi + +touch $data/$part/.complete + +if [ $part == "data_aishell" ]; then + cd $data/$part/wav || exit 1 + for wav in ./*.tar.gz; do + echo "Extracting wav from $wav" + tar -zxf $wav && rm $wav + done +fi + +echo "$0: Successfully downloaded and un-tarred $data/$part.tgz" + +if $remove_archive; then + echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied." + rm $data/$part.tgz +fi + +exit 0; diff --git a/egs/aishell/paraformer/local/prepare_data.sh b/egs/aishell/paraformer/local/prepare_data.sh deleted file mode 100755 index 77791f9c1..000000000 --- a/egs/aishell/paraformer/local/prepare_data.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash -# Copyright 2018 AIShell-Foundation(Authors:Jiayu DU, Xingyu NA, Bengu WU, Hao ZHENG) -# 2018 Beijing Shell Shell Tech. Co. Ltd. (Author: Hui BU) -# Apache 2.0 - -# transform raw AISHELL-2 data to kaldi format - -. ./path.sh || exit 1; - -tmp= -dir= - -if [ $# != 3 ]; then - echo "Usage: $0 " - echo " $0 /export/AISHELL-2/iOS/train data/local/train data/train" - exit 1; -fi - -corpus=$1 -tmp=$2 -dir=$3 - -echo "prepare_data.sh: Preparing data in $corpus" - -mkdir -p $tmp -mkdir -p $dir - -# corpus check -if [ ! -d $corpus ] || [ ! -f $corpus/wav.scp ] || [ ! -f $corpus/trans.txt ]; then - echo "Error: $0 requires wav.scp and trans.txt under $corpus directory." - exit 1; -fi - -# validate utt-key list, IC0803W0380 is a bad utterance -awk '{print $1}' $corpus/wav.scp | grep -v 'IC0803W0380' > $tmp/wav_utt.list -awk '{print $1}' $corpus/trans.txt > $tmp/trans_utt.list -utils/filter_scp.pl -f 1 $tmp/wav_utt.list $tmp/trans_utt.list > $tmp/utt.list - -# wav.scp -awk -F'\t' -v path_prefix=$corpus '{printf("%s\t%s/%s\n",$1,path_prefix,$2)}' $corpus/wav.scp > $tmp/tmp_wav.scp -utils/filter_scp.pl -f 1 $tmp/utt.list $tmp/tmp_wav.scp | sort -k 1 | uniq > $tmp/wav.scp - -# text -utils/filter_scp.pl -f 1 $tmp/utt.list $corpus/trans.txt | sort -k 1 | uniq > $tmp/text - -# copy prepared resources from tmp_dir to target dir -mkdir -p $dir -for f in wav.scp text; do - cp $tmp/$f $dir/$f || exit 1; -done - -echo "local/prepare_data.sh succeeded" -exit 0; diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh index 3556bd6be..0a69772fb 100755 --- a/egs/aishell/paraformer/run.sh +++ b/egs/aishell/paraformer/run.sh @@ -13,7 +13,7 @@ train_cmd=utils/run.pl infer_cmd=utils/run.pl # general configuration -feats_dir="/nfs/wangjiaming.wjm/Funasr_data_test/aishell" #feature output dictionary +feats_dir="../DATA" #feature output dictionary exp_dir="." lang=zh dumpdir=dump/fbank @@ -21,8 +21,8 @@ feats_type=fbank token_type=char scp=wav.scp type=sound -stage=3 -stop_stage=4 +stage=1 +stop_stage=1 # feature configuration feats_dim=80 @@ -31,7 +31,8 @@ nj=32 speed_perturb="0.9,1.0,1.1" # data -data_aishell= +raw_data= +data_url=www.openslr.org/resources/33 # exp tag tag="" @@ -66,10 +67,16 @@ else _ngpu=0 fi +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "stage -1: Data Download" + local/download_and_untar.sh ${raw_data} ${data_url} data_aishell + local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell +fi + if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "stage 0: Data preparation" # Data preparation - local/aishell_data_prep.sh ${data_aishell}/data_aishell/wav ${data_aishell}/data_aishell/transcript ${feats_dir} + local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir} for x in train dev test; do cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \ @@ -80,45 +87,10 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then fi feat_train_dir=${feats_dir}/${dumpdir}/train; mkdir -p ${feat_train_dir} -feat_dev_dir=${feats_dir}/${dumpdir}/dev; mkdir -p ${feat_dev_dir} -feat_test_dir=${feats_dir}/${dumpdir}/test; mkdir -p ${feat_test_dir} if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: Feature Generation" - # compute fbank features - fbankdir=${feats_dir}/fbank - utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \ - ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train - utils/fix_data_feat.sh ${fbankdir}/train - utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \ - ${feats_dir}/data/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev - utils/fix_data_feat.sh ${fbankdir}/dev - utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \ - ${feats_dir}/data/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test - utils/fix_data_feat.sh ${fbankdir}/test - - # compute global cmvn + echo "stage 1: Feature and CMVN Generation" utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \ - ${fbankdir}/train ${exp_dir}/exp/make_fbank/train - - # apply cmvn - utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ - ${fbankdir}/train ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/train ${feat_train_dir} - utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ - ${fbankdir}/dev ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir} - utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \ - ${fbankdir}/test ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/test ${feat_test_dir} - - cp ${fbankdir}/train/text ${fbankdir}/train/speech_shape ${fbankdir}/train/text_shape ${feat_train_dir} - cp ${fbankdir}/dev/text ${fbankdir}/dev/speech_shape ${fbankdir}/dev/text_shape ${feat_dev_dir} - cp ${fbankdir}/test/text ${fbankdir}/test/speech_shape ${fbankdir}/test/text_shape ${feat_test_dir} - - utils/fix_data_feat.sh ${feat_train_dir} - utils/fix_data_feat.sh ${feat_dev_dir} - utils/fix_data_feat.sh ${feat_test_dir} - - #generate ark list - utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/train ${feat_train_dir} - utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/dev ${feat_dev_dir} + ${feats_dir}/data/${train_set} ${exp_dir}/exp/make_fbank/${train_set} fi token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt @@ -136,12 +108,6 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then num_token=$(cat ${token_list} | wc -l) echo "" >> ${token_list} vocab_size=$(cat ${token_list} | wc -l) - awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char - awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char - mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/train - mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/dev - cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/train - cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/dev fi # Training Stage diff --git a/egs/aishell/transformer/utils/compute_cmvn.sh b/egs/aishell/transformer/utils/compute_cmvn.sh index 12173ee96..dc0a69e02 100755 --- a/egs/aishell/transformer/utils/compute_cmvn.sh +++ b/egs/aishell/transformer/utils/compute_cmvn.sh @@ -13,13 +13,17 @@ echo "$0 $@" fbankdir=$1 logdir=$2 -output_dir=${fbankdir}/cmvn; mkdir -p ${output_dir} -mkdir -p ${logdir} +output_dir=${fbankdir}/cmvn/split_${nj}; +split_scps="" +for n in $(seq $nj); do + split_scps="$split_scps $output_dir/wav.$n.scp" +done +utils/split_scp.pl ${fbankdir}/wav.scp $split_scps || exit 1; -$cmd JOB=1:$nj $logdir/cmvn.JOB.log \ - python utils/compute_cmvn.py -d ${feats_dim} -a $fbankdir/ark -i JOB -o ${output_dir} \ - || exit 1; - -python utils/combine_cmvn_file.py -d ${feats_dim} -c ${output_dir} -n $nj -o $fbankdir - -echo "$0: Succeeded compute global cmvn" +#$cmd JOB=1:$nj $logdir/cmvn.JOB.log \ +# python utils/compute_cmvn.py -d ${feats_dim} -a $fbankdir/ark -i JOB -o ${output_dir} \ +# || exit 1; +# +#python utils/combine_cmvn_file.py -d ${feats_dim} -c ${output_dir} -n $nj -o $fbankdir +# +#echo "$0: Succeeded compute global cmvn"