update repo

This commit is contained in:
嘉渊 2023-05-25 17:02:24 +08:00
parent bade5bfca1
commit ca79f9c404
7 changed files with 215 additions and 70 deletions

View File

@ -0,0 +1,110 @@
#!/usr/bin/env bash
# Set bash to strict mode; the script will exit on:
# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
# (-x 'print commands' is not enabled here; add `set -x` when debugging).
set -e
set -u
set -o pipefail
# Print a timestamped message tagged with the caller's location.
# Arguments: $* - message text (backslash escapes are interpreted by echo -e)
# Outputs:   "<ISO-like timestamp> (<file>:<line>:<function>) <message>" on stdout
log() {
    # Basename of the file that called log(), plus the calling line/function.
    local caller_file=${BASH_SOURCE[1]##*/}
    local location="${caller_file}:${BASH_LINENO[0]}:${FUNCNAME[1]}"
    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${location}) $*"
}
# Reset bash's built-in SECONDS timer; it is reported in the final
# "elapsed" log line of this script.
SECONDS=0
# general configuration
# nj:         upper bound on the number of parallel audio-processing jobs (stage 2)
# stage:      first stage to run
# stop_stage: last stage to run
# set:        WenetSpeech training subset tag; forwarded as --train-subset and
#             lower-cased to build the train_<set> directory name
# data_dir:   directory in which the per-set data directories are created
nj=10
stage=1
stop_stage=100
set=L
data_dir="data"
# Record the exact invocation before option parsing.
log "$0 $*"
# NOTE(review): presumably lets `--name value` command-line flags override the
# variables above -- confirm against utils/parse_options.sh.
. utils/parse_options.sh
# Environment files of the recipe. Later code reads ${train_cmd} and
# ${WENETSPEECH}; presumably they are defined by cmd.sh and db.sh -- verify.
. ./path.sh || exit 1;
. ./cmd.sh || exit 1;
. ./db.sh || exit 1;
# Validate the corpus location before any stage runs.
# "${WENETSPEECH:-}" keeps `set -u` from aborting with a bare "unbound variable"
# error, so the hint below is still printed when the variable was never set.
if [ ! -e "${WENETSPEECH:-}" ]; then
    log "Fill the value of 'WENETSPEECH' of db.sh"
    log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/"
    exit 1
fi
# The corpus is only usable when the audio/ directory AND the WenetSpeech.json
# metadata file are both present, so fail as soon as EITHER one is missing.
if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
    echo "and re-construct the data."
    exit 1
fi
# Names of the data sets handled by every stage below.
dev_set="dev"
test_sets="test_net test_meeting"
# The training directory is named after the lower-cased subset tag (e.g. L -> train_l).
train_set="train_$(echo "${set}" | tr '[:upper:]' '[:lower:]')"
# Stage 1: build Kaldi-format data directories from the raw corpus.
# All path expansions are quoted so directories containing spaces or glob
# characters are passed through as single arguments.
if [ "${stage}" -le 1 ] && [ "${stop_stage}" -ge 1 ]; then
    log "data preparation"
    mkdir -p "${data_dir}"
    abs_data_dir=$(readlink -f "${data_dir}")
    log "making Kaldi format data directory in ${abs_data_dir}"
    local/wenetspeech_data_prep.sh \
        --train-subset "${set}" \
        --stage 1 \
        "${WENETSPEECH}" \
        "${abs_data_dir}"

    # prepare utt2spk and spk2utt files
    # ${test_sets} is intentionally unquoted: it is a space-separated list.
    for x in ${train_set} ${dev_set} ${test_sets}; do
        dir="${data_dir}/${x}"
        # The first TAB-separated field of `segments` is written twice per line,
        # i.e. the same id is used in both columns of utt2spk.
        paste -d " " <(cut -f 1 "${dir}/segments") <(cut -f 1 "${dir}/segments") | \
            sort -u > "${dir}/utt2spk"
        utils/utt2spk_to_spk2utt.pl "${dir}/utt2spk" > "${dir}/spk2utt"
    done
fi
# Stage 2: cut the long recordings into per-segment audio files, in parallel.
if [ "${stage}" -le 2 ] && [ "${stop_stage}" -ge 2 ]; then
    log "process the long term opus audio file, may take about 3 hours"
    # ${test_sets} is intentionally unquoted: it is a space-separated list.
    for x in ${train_set} ${dev_set} ${test_sets}; do
        log "process audio for ${data_dir}/${x}"
        dir="${data_dir}/${x}"
        mkdir -p "${dir}/logs"

        nutt=$(<"${dir}/segments" wc -l)
        # Per-set job count: never more jobs than segment lines. A separate
        # variable is used so a small set does not permanently lower ${nj}
        # for the sets processed after it.
        _nj=$((nj < nutt ? nj : nutt))

        # One segments shard per job, kept in an array so each path stays a
        # single argument.
        split_scps=()
        for n in $(seq "${_nj}"); do
            split_scps+=("${dir}/logs/segments.${n}")
        done
        utils/split_scp.pl "${dir}/segments" "${split_scps[@]}"

        # ${train_cmd} is intentionally unquoted so that a launcher given
        # together with its options is split into separate words.
        ${train_cmd} "JOB=1:${_nj}" "${dir}/logs/process_audio.JOB.log" \
            python3 local/process_opus.py \
            "${dir}/wav.scp" \
            "${dir}/logs/segments.JOB" \
            "${dir}/logs/wav.JOB.scp"

        # modify the `wav.scp` file and rename the `segments` file
        # rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
        mv "${dir}/wav.scp" "${dir}/wav.scp.org"
        mv "${dir}/segments" "${dir}/segments.org"

        # Merge the per-job outputs into the new wav.scp.
        for n in $(seq "${_nj}"); do
            cat "${dir}/logs/wav.${n}.scp" || exit 1;
        done | sort -u > "${dir}/wav.scp"
    done
fi
# Stage 3: normalize the transcripts of every data set.
# Path expansions are quoted so directories containing spaces or glob
# characters are handled correctly.
if [ "${stage}" -le 3 ] && [ "${stop_stage}" -ge 3 ]; then
    log "format text file"
    # ${test_sets} is intentionally unquoted: it is a space-separated list.
    for x in ${train_set} ${dev_set} ${test_sets}; do
        log "format text for ${data_dir}/${x}"
        dir="${data_dir}/${x}"
        # Keep the raw transcripts as text.org and regenerate `text` from them.
        mv "${dir}/text" "${dir}/text.org"
        # Column 1 (TAB-separated) is kept as is; the remaining columns are
        # piped through the normalizer and re-attached with a single space.
        paste -d " " <(cut -f 1 "${dir}/text.org") \
            <(cut -f 2- "${dir}/text.org" | local/text_normalize.pl) | \
            sort -u > "${dir}/text"
        utils/fix_data_dir.sh "${dir}"
    done
fi

log "Successfully finished. [elapsed=${SECONDS}s]"

View File

@ -13,20 +13,22 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import argparse
import json
import os
import sys
def get_args():
parser = argparse.ArgumentParser(description="""
parser = argparse.ArgumentParser(
description="""
This script is used to process raw json dataset of WenetSpeech,
where the long wav is splitinto segments and
data of wenet format is generated.
""")
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
parser.add_argument('output_dir', help="""Output dir for prepared data""")
"""
)
parser.add_argument("input_json", help="""Input json file of WenetSpeech""")
parser.add_argument("output_dir", help="""Output dir for prepared data""")
args = parser.parse_args()
return args
@ -39,58 +41,68 @@ def meta_analysis(input_json, output_dir):
os.makedirs(output_dir)
try:
with open(input_json, 'r') as injson:
with open(input_json, "r") as injson:
json_data = json.load(injson)
except Exception:
sys.exit(f'Failed to load input json file: {input_json}')
sys.exit(f"Failed to load input json file: {input_json}")
else:
if json_data['audios'] is not None:
with open(f'{output_dir}/text', 'w') as utt2text, \
open(f'{output_dir}/segments', 'w') as segments, \
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
for long_audio in json_data['audios']:
if json_data["audios"] is not None:
with open(f"{output_dir}/text", "w") as utt2text, open(
f"{output_dir}/segments", "w"
) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open(
f"{output_dir}/wav.scp", "w"
) as wavscp, open(
f"{output_dir}/utt2subsets", "w"
) as utt2subsets, open(
f"{output_dir}/reco2dur", "w"
) as reco2dur:
for long_audio in json_data["audios"]:
try:
long_audio_path = os.path.realpath(
os.path.join(input_dir, long_audio['path']))
aid = long_audio['aid']
segments_lists = long_audio['segments']
duration = long_audio['duration']
assert (os.path.exists(long_audio_path))
os.path.join(input_dir, long_audio["path"])
)
aid = long_audio["aid"]
segments_lists = long_audio["segments"]
duration = long_audio["duration"]
assert os.path.exists(long_audio_path)
except AssertionError:
print(f'''Warning: {aid} something is wrong,
maybe AssertionError, skipped''')
print(
f"""Warning: {aid} something is wrong,
maybe AssertionError, skipped"""
)
continue
except Exception:
print(f'''Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped''')
print(
f"""Warning: {aid} something is wrong, maybe the
error path: {long_audio_path}, skipped"""
)
continue
else:
wavscp.write(f'{aid}\t{long_audio_path}\n')
reco2dur.write(f'{aid}\t{duration}\n')
wavscp.write(f"{aid}\t{long_audio_path}\n")
reco2dur.write(f"{aid}\t{duration}\n")
for segment_file in segments_lists:
try:
sid = segment_file['sid']
start_time = segment_file['begin_time']
end_time = segment_file['end_time']
sid = segment_file["sid"]
start_time = segment_file["begin_time"]
end_time = segment_file["end_time"]
dur = end_time - start_time
text = segment_file['text']
text = segment_file["text"]
segment_subsets = segment_file["subsets"]
except Exception:
print(f'''Warning: {segment_file} something
is wrong, skipped''')
print(
f"""Warning: {segment_file} something
is wrong, skipped"""
)
continue
else:
utt2text.write(f'{sid}\t{text}\n')
utt2text.write(f"{sid}\t{text}\n")
segments.write(
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
f"{sid}\t{aid}\t{start_time}\t{end_time}\n"
)
utt2dur.write(f'{sid}\t{dur}\n')
utt2dur.write(f"{sid}\t{dur}\n")
segment_sub_names = " ".join(segment_subsets)
utt2subsets.write(
f'{sid}\t{segment_sub_names}\n')
utt2subsets.write(f"{sid}\t{segment_sub_names}\n")
def main():
args = get_args()
@ -98,5 +110,5 @@ def main():
meta_analysis(args.input_json, args.output_dir)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

View File

@ -16,14 +16,15 @@
# usage: python3 process_opus.py wav.scp segments output_wav.scp
from pydub import AudioSegment
import sys
import os
import sys
from pydub import AudioSegment
def read_file(wav_scp, segments):
wav_scp_dict = {}
with open(wav_scp, 'r', encoding='UTF-8') as fin:
with open(wav_scp, "r", encoding="UTF-8") as fin:
for line_str in fin:
wav_id, path = line_str.strip().split()
wav_scp_dict[wav_id] = path
@ -32,7 +33,7 @@ def read_file(wav_scp, segments):
seg_path_list = []
start_time_list = []
end_time_list = []
with open(segments, 'r', encoding='UTF-8') as fin:
with open(segments, "r", encoding="UTF-8") as fin:
for line_str in fin:
arr = line_str.strip().split()
assert len(arr) == 4
@ -44,30 +45,27 @@ def read_file(wav_scp, segments):
# TODO(Qijie): Fix the process logic
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list):
def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
num_utts = len(utt_list)
step = int(num_utts * 0.01)
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
with open(output_wav_scp, "w", encoding="UTF-8") as fout:
previous_wav_path = ""
for i in range(num_utts):
utt_id = utt_list[i]
current_wav_path = seg_path_list[i]
output_dir = (os.path.dirname(current_wav_path)) \
.replace("audio", 'audio_seg')
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
# if not os.path.exists(output_dir):
# os.makedirs(output_dir)
output_dir = (os.path.dirname(current_wav_path)).replace(
"audio", "audio_seg"
)
seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
os.makedirs(output_dir, exist_ok=True)
if current_wav_path != previous_wav_path:
source_wav = AudioSegment.from_file(current_wav_path)
previous_wav_path = current_wav_path
start = int(start_time_list[i] * 1000)
end = int(end_time_list[i] * 1000)
target_audio = source_wav[start:end].set_frame_rate(16000) \
.set_sample_width(2)
target_audio = source_wav[start:end].set_frame_rate(16000)
target_audio.export(seg_wav_path, format="wav")
fout.write("{} {}\n".format(utt_id, seg_wav_path))
@ -80,11 +78,11 @@ def main():
segments = sys.argv[2]
output_wav_scp = sys.argv[3]
utt_list, seg_path_list, start_time_list, end_time_list \
= read_file(wav_scp, segments)
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
end_time_list)
utt_list, seg_path_list, start_time_list, end_time_list = read_file(
wav_scp, segments
)
output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
if __name__ == '__main__':
if __name__ == "__main__":
main()

View File

@ -0,0 +1,24 @@
#!/usr/bin/env perl
use utf8;
use open qw(:std :utf8);
use warnings;
while (<STDIN>) {
chomp;
# remove non UTF-8 whitespace character
if ($_ =~ / /) {$_ =~ s: ::g;}
if ($_ =~ / /) {$_ =~ s: ::g;}
# upper letters
# This must be an assignment (`=`), not a binding (`=~`): `$_ =~ uc $_` would
# only pattern-match the line against its own upper-cased copy (interpreted as
# a regex), leaving the text unchanged and dying on unbalanced regex
# metacharacters such as "(" or "[" in the transcript.
if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
# Mark the gaps that must survive with "_" placeholders:
#  - between neighbouring upper-case English words (applied twice so that
#    overlapping pairs such as "A B C" are all joined), and
#  - at every boundary between an English word and a run of Han characters.
s/([A-Z]+)\s+([A-Z]+)/${1}_${2}/g;
s/([A-Z]+)\s+([A-Z]+)/${1}_${2}/g;
s/([A-Z]+)(\p{Han}+)/${1}_${2}/g;
s/(\p{Han}+)([A-Z]+)/${1}_${2}/g;
# Drop every remaining whitespace character ...
s/\s+//g;
# ... then turn each "_" into one ordinary space.
tr/_/ /;
print "$_\n";
}

View File

@ -24,7 +24,7 @@ stage=1
prefix=
train_subset=L
. ./utils/parse_options.sh || exit 1;
. utils/parse_options.sh || exit 1;
filter_by_id () {
idlist=$1

View File

@ -41,6 +41,7 @@ set -e
set -u
set -o pipefail
set=L
train_set=train_l
valid_set=dev
test_sets="dev test_net test_meeting"
@ -71,15 +72,15 @@ fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "stage 0: Data preparation"
# Data preparation
local/wenetspeech_data_prep.sh $raw_data $feats_dir
mkdir $feats_dir/data
mv $feats_dir/$train_set $feats_dir/data/$train_set
for x in $test_sets; do
mv $feats_dir/$x $feats_dir/data/
done
# Pass the flag and its value as two separate arguments; a single quoted
# "--set L" word cannot be parsed as a `--name value` option pair.
local/data.sh --set "${set}"
# mkdir $feats_dir/data
# mv $feats_dir/$train_set $feats_dir/data/$train_set
# for x in $test_sets; do
# mv $feats_dir/$x $feats_dir/data/
# done
fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "stage 1: Feature and CMVN Generation"
utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
fi
#if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# echo "stage 1: Feature and CMVN Generation"
# utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
#fi