diff --git a/egs/wenetspeech/conformer/local/data.sh b/egs/wenetspeech/conformer/local/data.sh new file mode 100755 index 000000000..dcfba5f58 --- /dev/null +++ b/egs/wenetspeech/conformer/local/data.sh @@ -0,0 +1,110 @@ +#!/usr/bin/env bash +# Set bash to 'debug' mode, it will exit on : +# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands', +set -e +set -u +set -o pipefail + +log() { + local fname=${BASH_SOURCE[1]##*/} + echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*" +} +SECONDS=0 + +# general configuration +nj=10 +stage=1 +stop_stage=100 +set=L +data_dir="data" + +log "$0 $*" +. utils/parse_options.sh + +. ./path.sh || exit 1; +. ./cmd.sh || exit 1; +. ./db.sh || exit 1; + +if [ ! -e "${WENETSPEECH}" ]; then + log "Fill the value of 'WENETSPEECH' of db.sh" + log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/" + exit 1 +fi + +if [ ! -d "${WENETSPEECH}/audio" ] && [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then + echo "Valid WENETSPEECH data not found in ${WENETSPEECH}." + echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/" + echo "and re-construct the data." + exit 1 +fi + +train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")" +dev_set=dev +test_sets="test_net test_meeting" + +if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then + log "data preparation" + mkdir -p ${data_dir} + abs_data_dir=$(readlink -f ${data_dir}) + log "making Kaldi format data directory in ${abs_data_dir}" + local/wenetspeech_data_prep.sh \ + --train-subset ${set} \ + --stage 1 \ + ${WENETSPEECH} \ + ${abs_data_dir} + + # prepare utt2spk and spk2utt files + for x in ${train_set} ${dev_set} ${test_sets}; do + dir=${data_dir}/${x} + paste -d " " <(cut -f 1 ${dir}/segments) <(cut -f 1 ${dir}/segments) | \ + sort -u > ${dir}/utt2spk + utils/utt2spk_to_spk2utt.pl ${dir}/utt2spk > ${dir}/spk2utt + done +fi + +if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then + log "process the long term opus audio file, may take about 3 hours" + for x in ${train_set} ${dev_set} ${test_sets}; do + log "process audio for ${data_dir}/${x}" + dir=${data_dir}/${x} + mkdir -p ${dir}/logs + + nutt=$(<${dir}/segments wc -l) + nj=$((nj ${dir}/wav.scp + done +fi + +if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then + log "format text file" + for x in ${train_set} ${dev_set} ${test_sets}; do + log "format text for ${data_dir}/${x}" + dir=${data_dir}/${x} + mv ${dir}/text ${dir}/text.org + paste -d " " <(cut -f 1 ${dir}/text.org) \ + <(cut -f 2- ${dir}/text.org | local/text_normalize.pl) | \ + sort -u > ${dir}/text + utils/fix_data_dir.sh ${dir} + done +fi + +log "Successfully finished. [elapsed=${SECONDS}s]" diff --git a/egs/wenetspeech/conformer/local/extract_meta.py b/egs/wenetspeech/conformer/local/extract_meta.py index ce2871d0b..607416203 100755 --- a/egs/wenetspeech/conformer/local/extract_meta.py +++ b/egs/wenetspeech/conformer/local/extract_meta.py @@ -13,20 +13,22 @@ # See the License for the specific language governing permissions and # limitations under the License. -import sys -import os import argparse import json +import os +import sys def get_args(): - parser = argparse.ArgumentParser(description=""" + parser = argparse.ArgumentParser( + description=""" This script is used to process raw json dataset of WenetSpeech, where the long wav is splitinto segments and data of wenet format is generated. - """) - parser.add_argument('input_json', help="""Input json file of WenetSpeech""") - parser.add_argument('output_dir', help="""Output dir for prepared data""") + """ + ) + parser.add_argument("input_json", help="""Input json file of WenetSpeech""") + parser.add_argument("output_dir", help="""Output dir for prepared data""") args = parser.parse_args() return args @@ -39,58 +41,68 @@ def meta_analysis(input_json, output_dir): os.makedirs(output_dir) try: - with open(input_json, 'r') as injson: + with open(input_json, "r") as injson: json_data = json.load(injson) except Exception: - sys.exit(f'Failed to load input json file: {input_json}') + sys.exit(f"Failed to load input json file: {input_json}") else: - if json_data['audios'] is not None: - with open(f'{output_dir}/text', 'w') as utt2text, \ - open(f'{output_dir}/segments', 'w') as segments, \ - open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ - open(f'{output_dir}/wav.scp', 'w') as wavscp, \ - open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ - open(f'{output_dir}/reco2dur', 'w') as reco2dur: - for long_audio in json_data['audios']: + if json_data["audios"] is not None: + with open(f"{output_dir}/text", "w") as utt2text, open( + f"{output_dir}/segments", "w" + ) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open( + f"{output_dir}/wav.scp", "w" + ) as wavscp, open( + f"{output_dir}/utt2subsets", "w" + ) as utt2subsets, open( + f"{output_dir}/reco2dur", "w" + ) as reco2dur: + for long_audio in json_data["audios"]: try: long_audio_path = os.path.realpath( - os.path.join(input_dir, long_audio['path'])) - aid = long_audio['aid'] - segments_lists = long_audio['segments'] - duration = long_audio['duration'] - assert (os.path.exists(long_audio_path)) + os.path.join(input_dir, long_audio["path"]) + ) + aid = long_audio["aid"] + segments_lists = long_audio["segments"] + duration = long_audio["duration"] + assert os.path.exists(long_audio_path) except AssertionError: - print(f'''Warning: {aid} something is wrong, - maybe AssertionError, skipped''') + print( + f"""Warning: {aid} something is wrong, + maybe AssertionError, skipped""" + ) continue except Exception: - print(f'''Warning: {aid} something is wrong, maybe the - error path: {long_audio_path}, skipped''') + print( + f"""Warning: {aid} something is wrong, maybe the + error path: {long_audio_path}, skipped""" + ) continue else: - wavscp.write(f'{aid}\t{long_audio_path}\n') - reco2dur.write(f'{aid}\t{duration}\n') + wavscp.write(f"{aid}\t{long_audio_path}\n") + reco2dur.write(f"{aid}\t{duration}\n") for segment_file in segments_lists: try: - sid = segment_file['sid'] - start_time = segment_file['begin_time'] - end_time = segment_file['end_time'] + sid = segment_file["sid"] + start_time = segment_file["begin_time"] + end_time = segment_file["end_time"] dur = end_time - start_time - text = segment_file['text'] + text = segment_file["text"] segment_subsets = segment_file["subsets"] except Exception: - print(f'''Warning: {segment_file} something - is wrong, skipped''') + print( + f"""Warning: {segment_file} something + is wrong, skipped""" + ) continue else: - utt2text.write(f'{sid}\t{text}\n') + utt2text.write(f"{sid}\t{text}\n") segments.write( - f'{sid}\t{aid}\t{start_time}\t{end_time}\n' + f"{sid}\t{aid}\t{start_time}\t{end_time}\n" ) - utt2dur.write(f'{sid}\t{dur}\n') + utt2dur.write(f"{sid}\t{dur}\n") segment_sub_names = " ".join(segment_subsets) - utt2subsets.write( - f'{sid}\t{segment_sub_names}\n') + utt2subsets.write(f"{sid}\t{segment_sub_names}\n") + def main(): args = get_args() @@ -98,5 +110,5 @@ def main(): meta_analysis(args.input_json, args.output_dir) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/egs/wenetspeech/conformer/local/path.sh b/egs/wenetspeech/conformer/local/path.sh new file mode 100644 index 000000000..e69de29bb diff --git a/egs/wenetspeech/conformer/local/process_opus.py b/egs/wenetspeech/conformer/local/process_opus.py index 9f71eb1a6..044d183b9 100755 --- a/egs/wenetspeech/conformer/local/process_opus.py +++ b/egs/wenetspeech/conformer/local/process_opus.py @@ -16,14 +16,15 @@ # usage: python3 process_opus.py wav.scp segments output_wav.scp -from pydub import AudioSegment -import sys import os +import sys + +from pydub import AudioSegment def read_file(wav_scp, segments): wav_scp_dict = {} - with open(wav_scp, 'r', encoding='UTF-8') as fin: + with open(wav_scp, "r", encoding="UTF-8") as fin: for line_str in fin: wav_id, path = line_str.strip().split() wav_scp_dict[wav_id] = path @@ -32,7 +33,7 @@ def read_file(wav_scp, segments): seg_path_list = [] start_time_list = [] end_time_list = [] - with open(segments, 'r', encoding='UTF-8') as fin: + with open(segments, "r", encoding="UTF-8") as fin: for line_str in fin: arr = line_str.strip().split() assert len(arr) == 4 @@ -44,30 +45,27 @@ def read_file(wav_scp, segments): # TODO(Qijie): Fix the process logic -def output(output_wav_scp, utt_list, seg_path_list, start_time_list, - end_time_list): +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list): num_utts = len(utt_list) step = int(num_utts * 0.01) - with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + with open(output_wav_scp, "w", encoding="UTF-8") as fout: previous_wav_path = "" for i in range(num_utts): utt_id = utt_list[i] current_wav_path = seg_path_list[i] - output_dir = (os.path.dirname(current_wav_path)) \ - .replace("audio", 'audio_seg') - seg_wav_path = os.path.join(output_dir, utt_id + '.wav') - - # if not os.path.exists(output_dir): - # os.makedirs(output_dir) + output_dir = (os.path.dirname(current_wav_path)).replace( + "audio", "audio_seg" + ) + seg_wav_path = os.path.join(output_dir, utt_id + ".wav") + os.makedirs(output_dir, exist_ok=True) if current_wav_path != previous_wav_path: source_wav = AudioSegment.from_file(current_wav_path) previous_wav_path = current_wav_path start = int(start_time_list[i] * 1000) end = int(end_time_list[i] * 1000) - target_audio = source_wav[start:end].set_frame_rate(16000) \ - .set_sample_width(2) + target_audio = source_wav[start:end].set_frame_rate(16000) target_audio.export(seg_wav_path, format="wav") fout.write("{} {}\n".format(utt_id, seg_wav_path)) @@ -80,11 +78,11 @@ def main(): segments = sys.argv[2] output_wav_scp = sys.argv[3] - utt_list, seg_path_list, start_time_list, end_time_list \ - = read_file(wav_scp, segments) - output(output_wav_scp, utt_list, seg_path_list, start_time_list, - end_time_list) + utt_list, seg_path_list, start_time_list, end_time_list = read_file( + wav_scp, segments + ) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list) -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/egs/wenetspeech/conformer/local/text_normalize.pl b/egs/wenetspeech/conformer/local/text_normalize.pl new file mode 100755 index 000000000..55b35e210 --- /dev/null +++ b/egs/wenetspeech/conformer/local/text_normalize.pl @@ -0,0 +1,24 @@ +#!/usr/bin/env perl +use utf8; +use open qw(:std :utf8); +use warnings; + +while () { + chomp; + # remove non UTF-8 whitespace character + if ($_ =~ / /) {$_ =~ s: ::g;} + if ($_ =~ / /) {$_ =~ s: ::g;} + # upper letters + if ($_ =~ /[a-zA-Z]/) {$_ =~ uc $_;} + # add "_" before and after each English word + if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;} + if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;} + if ($_ =~ m/([A-Z]+)(\p{Han}+)/) {$_ =~ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;} + if ($_ =~ m/(\p{Han}+)([A-Z]+)/) {$_ =~ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;} + # remove UTF-8 whitespace charcter + if ($_ =~ /\s+/) {$_ =~ s:\s+::g;} + # replace "_" with a normal whitespace + if ($_ =~ /\_/) {$_ =~ s:\_: :g;} + + print "$_\n"; +} diff --git a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh index baa2b32df..4959328b8 100755 --- a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh +++ b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh @@ -24,7 +24,7 @@ stage=1 prefix= train_subset=L -. ./utils/parse_options.sh || exit 1; +. utils/parse_options.sh || exit 1; filter_by_id () { idlist=$1 diff --git a/egs/wenetspeech/conformer/run.sh b/egs/wenetspeech/conformer/run.sh index 006c0b992..2ccafd770 100644 --- a/egs/wenetspeech/conformer/run.sh +++ b/egs/wenetspeech/conformer/run.sh @@ -41,6 +41,7 @@ set -e set -u set -o pipefail +set=L train_set=train_l valid_set=dev test_sets="dev test_net test_meeting" @@ -71,15 +72,15 @@ fi if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then echo "stage 0: Data preparation" # Data preparation - local/wenetspeech_data_prep.sh $raw_data $feats_dir - mkdir $feats_dir/data - mv $feats_dir/$train_set $feats_dir/data/$train_set - for x in $test_sets; do - mv $feats_dir/$x $feats_dir/data/ - done + local/data.sh "--set ${set}" +# mkdir $feats_dir/data +# mv $feats_dir/$train_set $feats_dir/data/$train_set +# for x in $test_sets; do +# mv $feats_dir/$x $feats_dir/data/ +# done fi -if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then - echo "stage 1: Feature and CMVN Generation" - utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1 -fi +#if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +# echo "stage 1: Feature and CMVN Generation" +# utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1 +#fi