update repo

This commit is contained in:
嘉渊 2023-05-25 17:02:24 +08:00
parent bade5bfca1
commit ca79f9c404
7 changed files with 215 additions and 70 deletions

View File

@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Set bash to strict mode; the script will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'.
# (-x 'print commands' is NOT enabled here; add `set -x` when debugging.)
set -e
set -u
set -o pipefail
# Print a timestamped message tagged with where it was logged from.
# Arguments: $* - message text (joined with spaces; `echo -e` expands
#                 backslash escapes such as \t in it)
# Outputs:   one line on stdout of the form
#            <YYYY-mm-ddTHH:MM:SS> (<file>:<line>:<function>) <message>
log() {
  # Basename of the file that called log(), then file:line:function of the caller.
  local caller_file="${BASH_SOURCE[1]##*/}"
  local origin="${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
  echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${origin}) $*"
}
# Reset bash's built-in timer; the elapsed seconds are reported by the final
# log line of this script.
SECONDS=0

# general configuration
# nj:          upper bound on the number of parallel audio-processing jobs (stage 2)
# stage/stop_stage: first and last stage to run (inclusive)
# set:         WenetSpeech training subset; lower-cased below to name the training set
# data_dir:    directory that receives the prepared data directories
nj=10
stage=1
stop_stage=100
set=L
data_dir="data"

log "$0 $*"
# NOTE(review): parse_options.sh presumably overrides the variables above from
# `--name value` command-line flags -- confirm against utils/parse_options.sh.
. utils/parse_options.sh

. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
. ./db.sh || exit 1;

# `:-` keeps `set -u` from aborting with "unbound variable" before the hint
# below can be printed when db.sh does not define WENETSPEECH at all.
if [ ! -e "${WENETSPEECH:-}" ]; then
  log "Fill the value of 'WENETSPEECH' of db.sh"
  log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/"
  exit 1
fi

# The data set is only usable when BOTH the audio directory and the metadata
# json are present, so fail when EITHER is missing (the previous `&&` only
# failed when both were absent).
if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
  echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
  echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
  echo "and re-construct the data."
  exit 1
fi

# Names of the data directories handled by the stages below.
train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"
dev_set=dev
test_sets="test_net test_meeting"
# Stage 1: build the Kaldi-format data directories and derive utt2spk/spk2utt.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
  log "data preparation"
  mkdir -p "${data_dir}"
  abs_data_dir=$(readlink -f "${data_dir}")

  log "making Kaldi format data directory in ${abs_data_dir}"
  local/wenetspeech_data_prep.sh \
    --train-subset "${set}" \
    --stage 1 \
    "${WENETSPEECH}" \
    "${abs_data_dir}"

  # prepare utt2spk and spk2utt files
  # (set names are intentionally left unquoted: test_sets is a space-separated list)
  for x in ${train_set} ${dev_set} ${test_sets}; do
    dir=${data_dir}/${x}

    # A failing command inside <( ... ) is invisible to `set -e`/`pipefail`,
    # so check the input explicitly instead of silently writing an empty utt2spk.
    if [ ! -f "${dir}/segments" ]; then
      log "Error: ${dir}/segments not found"
      exit 1
    fi

    # Every utterance is its own speaker: "<utt-id> <utt-id>".
    paste -d " " <(cut -f 1 "${dir}/segments") <(cut -f 1 "${dir}/segments") \
      | sort -u > "${dir}/utt2spk"
    utils/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
  done
fi
# Stage 2: cut the long opus recordings into per-utterance audio, in parallel.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
  log "process the long term opus audio file, may take about 3 hours"
  # (set names are intentionally left unquoted: test_sets is a space-separated list)
  for x in ${train_set} ${dev_set} ${test_sets}; do
    log "process audio for ${data_dir}/${x}"
    dir=${data_dir}/${x}
    mkdir -p "${dir}/logs"

    nutt=$(wc -l < "${dir}/segments")
    if [ "${nutt}" -eq 0 ]; then
      log "Error: ${dir}/segments is empty"
      exit 1
    fi

    # Never start more jobs than there are utterances. Use a per-set value so
    # that a small set does not permanently lower `nj` for the sets after it.
    _nj=$((nj < nutt ? nj : nutt))

    # One segments shard per job.
    split_scps=()
    for n in $(seq "${_nj}"); do
      split_scps+=("${dir}/logs/segments.${n}")
    done
    utils/split_scp.pl "${dir}/segments" "${split_scps[@]}"

    # ${train_cmd} is left unquoted on purpose: it may carry its own options.
    ${train_cmd} "JOB=1:${_nj}" "${dir}/logs/process_audio.JOB.log" \
      python3 local/process_opus.py \
        "${dir}/wav.scp" \
        "${dir}/logs/segments.JOB" \
        "${dir}/logs/wav.JOB.scp"

    # modify the `wav.scp` file and rename the `segments` file
    # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
    mv "${dir}/wav.scp" "${dir}/wav.scp.org"
    mv "${dir}/segments" "${dir}/segments.org"

    # Merge the per-job outputs; a missing shard fails the pipeline (pipefail).
    for n in $(seq "${_nj}"); do
      cat "${dir}/logs/wav.${n}.scp" || exit 1
    done | sort -u > "${dir}/wav.scp"
  done
fi
# Stage 3: normalize the transcripts and clean up each data directory.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
  log "format text file"
  # (set names are intentionally left unquoted: test_sets is a space-separated list)
  for x in ${train_set} ${dev_set} ${test_sets}; do
    log "format text for ${data_dir}/${x}"
    dir=${data_dir}/${x}
    mv "${dir}/text" "${dir}/text.org"

    # Run the normalizer through ordinary pipelines and intermediate files
    # rather than inside <( ... ): a crash in process substitution is invisible
    # to `set -e`/`pipefail` and would silently produce a truncated `text`.
    # The scratch directory lives next to the data so it is on the same volume.
    tmp_dir=$(mktemp -d "${dir}/text_norm.XXXXXX")
    cut -f 1 "${dir}/text.org" > "${tmp_dir}/ids"
    cut -f 2- "${dir}/text.org" | local/text_normalize.pl > "${tmp_dir}/norm"
    paste -d " " "${tmp_dir}/ids" "${tmp_dir}/norm" | sort -u > "${dir}/text"
    # Remove the scratch directory before fix_data_dir.sh inspects ${dir}.
    rm -rf -- "${tmp_dir}"

    utils/fix_data_dir.sh "${dir}"
  done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"

View File

@ -13,20 +13,22 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import sys
import os
import argparse import argparse
import json import json
import os
import sys
def get_args(): def get_args():
parser = argparse.ArgumentParser(description=""" parser = argparse.ArgumentParser(
description="""
This script is used to process raw json dataset of WenetSpeech, This script is used to process raw json dataset of WenetSpeech,
where the long wav is splitinto segments and where the long wav is splitinto segments and
data of wenet format is generated. data of wenet format is generated.
""") """
parser.add_argument('input_json', help="""Input json file of WenetSpeech""") )
parser.add_argument('output_dir', help="""Output dir for prepared data""") parser.add_argument("input_json", help="""Input json file of WenetSpeech""")
parser.add_argument("output_dir", help="""Output dir for prepared data""")
args = parser.parse_args() args = parser.parse_args()
return args return args
@ -39,58 +41,68 @@ def meta_analysis(input_json, output_dir):
os.makedirs(output_dir) os.makedirs(output_dir)
try: try:
with open(input_json, 'r') as injson: with open(input_json, "r") as injson:
json_data = json.load(injson) json_data = json.load(injson)
except Exception: except Exception:
sys.exit(f'Failed to load input json file: {input_json}') sys.exit(f"Failed to load input json file: {input_json}")
else: else:
if json_data['audios'] is not None: if json_data["audios"] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \ with open(f"{output_dir}/text", "w") as utt2text, open(
open(f'{output_dir}/segments', 'w') as segments, \ f"{output_dir}/segments", "w"
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ ) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open(
open(f'{output_dir}/wav.scp', 'w') as wavscp, \ f"{output_dir}/wav.scp", "w"
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ ) as wavscp, open(
open(f'{output_dir}/reco2dur', 'w') as reco2dur: f"{output_dir}/utt2subsets", "w"
for long_audio in json_data['audios']: ) as utt2subsets, open(
f"{output_dir}/reco2dur", "w"
) as reco2dur:
for long_audio in json_data["audios"]:
try: try:
long_audio_path = os.path.realpath( long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path'])) os.path.join(input_dir, long_audio["path"])
aid = long_audio['aid'] )
segments_lists = long_audio['segments'] aid = long_audio["aid"]
duration = long_audio['duration'] segments_lists = long_audio["segments"]
assert (os.path.exists(long_audio_path)) duration = long_audio["duration"]
assert os.path.exists(long_audio_path)
except AssertionError: except AssertionError:
print(f'''Warning: {aid} something is wrong, print(
maybe AssertionError, skipped''') f"""Warning: {aid} something is wrong,
maybe AssertionError, skipped"""
)
continue continue
except Exception: except Exception:
print(f'''Warning: {aid} something is wrong, maybe the print(
error path: {long_audio_path}, skipped''') f"""Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped"""
)
continue continue
else: else:
wavscp.write(f'{aid}\t{long_audio_path}\n') wavscp.write(f"{aid}\t{long_audio_path}\n")
reco2dur.write(f'{aid}\t{duration}\n') reco2dur.write(f"{aid}\t{duration}\n")
for segment_file in segments_lists: for segment_file in segments_lists:
try: try:
sid = segment_file['sid'] sid = segment_file["sid"]
start_time = segment_file['begin_time'] start_time = segment_file["begin_time"]
end_time = segment_file['end_time'] end_time = segment_file["end_time"]
dur = end_time - start_time dur = end_time - start_time
text = segment_file['text'] text = segment_file["text"]
segment_subsets = segment_file["subsets"] segment_subsets = segment_file["subsets"]
except Exception: except Exception:
print(f'''Warning: {segment_file} something print(
is wrong, skipped''') f"""Warning: {segment_file} something
is wrong, skipped"""
)
continue continue
else: else:
utt2text.write(f'{sid}\t{text}\n') utt2text.write(f"{sid}\t{text}\n")
segments.write( segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n' f"{sid}\t{aid}\t{start_time}\t{end_time}\n"
) )
utt2dur.write(f'{sid}\t{dur}\n') utt2dur.write(f"{sid}\t{dur}\n")
segment_sub_names = " ".join(segment_subsets) segment_sub_names = " ".join(segment_subsets)
utt2subsets.write( utt2subsets.write(f"{sid}\t{segment_sub_names}\n")
f'{sid}\t{segment_sub_names}\n')
def main(): def main():
args = get_args() args = get_args()
@ -98,5 +110,5 @@ def main():
meta_analysis(args.input_json, args.output_dir) meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

View File

View File

@ -16,14 +16,15 @@
# usage: python3 process_opus.py wav.scp segments output_wav.scp # usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os import os
import sys
from pydub import AudioSegment
def read_file(wav_scp, segments): def read_file(wav_scp, segments):
wav_scp_dict = {} wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin: with open(wav_scp, "r", encoding="UTF-8") as fin:
for line_str in fin: for line_str in fin:
wav_id, path = line_str.strip().split() wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path wav_scp_dict[wav_id] = path
@ -32,7 +33,7 @@ def read_file(wav_scp, segments):
seg_path_list = [] seg_path_list = []
start_time_list = [] start_time_list = []
end_time_list = [] end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin: with open(segments, "r", encoding="UTF-8") as fin:
for line_str in fin: for line_str in fin:
arr = line_str.strip().split() arr = line_str.strip().split()
assert len(arr) == 4 assert len(arr) == 4
@ -44,30 +45,27 @@ def read_file(wav_scp, segments):
# TODO(Qijie): Fix the process logic # TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list, def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
end_time_list):
num_utts = len(utt_list) num_utts = len(utt_list)
step = int(num_utts * 0.01) step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout: with open(output_wav_scp, "w", encoding="UTF-8") as fout:
previous_wav_path = "" previous_wav_path = ""
for i in range(num_utts): for i in range(num_utts):
utt_id = utt_list[i] utt_id = utt_list[i]
current_wav_path = seg_path_list[i] current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \ output_dir = (os.path.dirname(current_wav_path)).replace(
.replace("audio", 'audio_seg') "audio", "audio_seg"
seg_wav_path = os.path.join(output_dir, utt_id + '.wav') )
seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
os.makedirs(output_dir, exist_ok=True)
if current_wav_path != previous_wav_path: if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path) source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000) start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000) end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000) \ target_audio = source_wav[start:end].set_frame_rate(16000)
.set_sample_width(2)
target_audio.export(seg_wav_path, format="wav") target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path)) fout.write("{} {}\n".format(utt_id, seg_wav_path))
@ -80,11 +78,11 @@ def main():
segments = sys.argv[2] segments = sys.argv[2]
output_wav_scp = sys.argv[3] output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \ utt_list, seg_path_list, start_time_list, end_time_list = read_file(
= read_file(wav_scp, segments) wav_scp, segments
output(output_wav_scp, utt_list, seg_path_list, start_time_list, )
end_time_list) output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
if __name__ == '__main__': if __name__ == "__main__":
main() main()

View File

@ -0,0 +1,24 @@
#!/usr/bin/env perl
use utf8;
use open qw(:std :utf8);
use warnings;
while (<STDIN>) {
chomp;
# remove non UTF-8 whitespace character
if ($_ =~ / /) {$_ =~ s: ::g;}
if ($_ =~ / /) {$_ =~ s: ::g;}
# upper letters
# Assign the upper-cased line back to $_. The previous `$_ =~ uc $_` was a
# pattern match (it used the upper-cased line as a regex and discarded the
# result), so the text was never upper-cased and lines containing regex
# metacharacters could abort the script.
if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
# add "_" before and after each English word
if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
if ($_ =~ m/([A-Z]+)(\p{Han}+)/) {$_ =~ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;}
if ($_ =~ m/(\p{Han}+)([A-Z]+)/) {$_ =~ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;}
# remove UTF-8 whitespace charcter
if ($_ =~ /\s+/) {$_ =~ s:\s+::g;}
# replace "_" with a normal whitespace
if ($_ =~ /\_/) {$_ =~ s:\_: :g;}
print "$_\n";
}

View File

@ -24,7 +24,7 @@ stage=1
prefix= prefix=
train_subset=L train_subset=L
. ./utils/parse_options.sh || exit 1; . utils/parse_options.sh || exit 1;
filter_by_id () { filter_by_id () {
idlist=$1 idlist=$1

View File

@ -41,6 +41,7 @@ set -e
set -u set -u
set -o pipefail set -o pipefail
set=L
train_set=train_l train_set=train_l
valid_set=dev valid_set=dev
test_sets="dev test_net test_meeting" test_sets="dev test_net test_meeting"
@ -71,15 +72,15 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation" echo "stage 0: Data preparation"
# Data preparation # Data preparation
local/wenetspeech_data_prep.sh $raw_data $feats_dir local/data.sh "--set ${set}"
mkdir $feats_dir/data # mkdir $feats_dir/data
mv $feats_dir/$train_set $feats_dir/data/$train_set # mv $feats_dir/$train_set $feats_dir/data/$train_set
for x in $test_sets; do # for x in $test_sets; do
mv $feats_dir/$x $feats_dir/data/ # mv $feats_dir/$x $feats_dir/data/
done # done
fi fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then #if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature and CMVN Generation" # echo "stage 1: Feature and CMVN Generation"
utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1 # utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
fi #fi