mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
update repo
This commit is contained in:
parent
bade5bfca1
commit
ca79f9c404
110
egs/wenetspeech/conformer/local/data.sh
Executable file
110
egs/wenetspeech/conformer/local/data.sh
Executable file
@ -0,0 +1,110 @@
|
||||
#!/usr/bin/env bash
|
||||
# Set bash to 'debug' mode, it will exit on :
|
||||
# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
|
||||
set -e
|
||||
set -u
|
||||
set -o pipefail
|
||||
|
||||
log() {
|
||||
local fname=${BASH_SOURCE[1]##*/}
|
||||
echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
|
||||
}
|
||||
SECONDS=0
|
||||
|
||||
# general configuration
|
||||
nj=10
|
||||
stage=1
|
||||
stop_stage=100
|
||||
set=L
|
||||
data_dir="data"
|
||||
|
||||
log "$0 $*"
|
||||
. utils/parse_options.sh
|
||||
|
||||
. ./path.sh || exit 1;
|
||||
. ./cmd.sh || exit 1;
|
||||
. ./db.sh || exit 1;
|
||||
|
||||
if [ ! -e "${WENETSPEECH}" ]; then
|
||||
log "Fill the value of 'WENETSPEECH' of db.sh"
|
||||
log "or download the data set follwing the instruction in https://wenet-e2e.github.io/WenetSpeech/"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Sanity-check the corpus layout: abort when either the audio directory or the
# WenetSpeech.json metadata file is missing under ${WENETSPEECH}.
# (The test must use "||": with "&&" a corpus missing only one of the two
# would slip through this check.)
if [ ! -d "${WENETSPEECH}/audio" ] || [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
|
||||
echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
|
||||
echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
|
||||
echo "and re-construct the data."
|
||||
exit 1
|
||||
fi
|
||||
|
||||
train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"
|
||||
dev_set=dev
|
||||
test_sets="test_net test_meeting"
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
log "data preparation"
|
||||
mkdir -p ${data_dir}
|
||||
abs_data_dir=$(readlink -f ${data_dir})
|
||||
log "making Kaldi format data directory in ${abs_data_dir}"
|
||||
local/wenetspeech_data_prep.sh \
|
||||
--train-subset ${set} \
|
||||
--stage 1 \
|
||||
${WENETSPEECH} \
|
||||
${abs_data_dir}
|
||||
|
||||
# prepare utt2spk and spk2utt files
|
||||
for x in ${train_set} ${dev_set} ${test_sets}; do
|
||||
dir=${data_dir}/${x}
|
||||
paste -d " " <(cut -f 1 ${dir}/segments) <(cut -f 1 ${dir}/segments) | \
|
||||
sort -u > ${dir}/utt2spk
|
||||
utils/utt2spk_to_spk2utt.pl ${dir}/utt2spk > ${dir}/spk2utt
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
|
||||
log "process the long term opus audio file, may take about 3 hours"
|
||||
for x in ${train_set} ${dev_set} ${test_sets}; do
|
||||
log "process audio for ${data_dir}/${x}"
|
||||
dir=${data_dir}/${x}
|
||||
mkdir -p ${dir}/logs
|
||||
|
||||
nutt=$(<${dir}/segments wc -l)
|
||||
nj=$((nj<nutt?nj:nutt))
|
||||
|
||||
split_scps=""
|
||||
for n in $(seq ${nj}); do
|
||||
split_scps="${split_scps} ${dir}/logs/segments.${n}"
|
||||
done
|
||||
utils/split_scp.pl ${dir}/segments ${split_scps}
|
||||
|
||||
${train_cmd} "JOB=1:${nj}" "${dir}/logs/process_audio.JOB.log"\
|
||||
python3 local/process_opus.py \
|
||||
${dir}/wav.scp \
|
||||
${dir}/logs/segments.JOB \
|
||||
${dir}/logs/wav.JOB.scp
|
||||
|
||||
# modify the `wav.scp` file and rename the `segments` file
|
||||
# rename the `segments` file to avoid the audio file formatting process in stage 3 of `asr.sh`
|
||||
mv ${dir}/wav.scp ${dir}/wav.scp.org
|
||||
mv ${dir}/segments ${dir}/segments.org
|
||||
for n in $(seq ${nj}); do
|
||||
cat ${dir}/logs/wav.${n}.scp || exit 1;
|
||||
done | sort -u > ${dir}/wav.scp
|
||||
done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
|
||||
log "format text file"
|
||||
for x in ${train_set} ${dev_set} ${test_sets}; do
|
||||
log "format text for ${data_dir}/${x}"
|
||||
dir=${data_dir}/${x}
|
||||
mv ${dir}/text ${dir}/text.org
|
||||
paste -d " " <(cut -f 1 ${dir}/text.org) \
|
||||
<(cut -f 2- ${dir}/text.org | local/text_normalize.pl) | \
|
||||
sort -u > ${dir}/text
|
||||
utils/fix_data_dir.sh ${dir}
|
||||
done
|
||||
fi
|
||||
|
||||
log "Successfully finished. [elapsed=${SECONDS}s]"
|
||||
@ -13,20 +13,22 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
def get_args():
|
||||
parser = argparse.ArgumentParser(description="""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="""
|
||||
This script is used to process raw json dataset of WenetSpeech,
|
||||
where the long wav is splitinto segments and
|
||||
data of wenet format is generated.
|
||||
""")
|
||||
parser.add_argument('input_json', help="""Input json file of WenetSpeech""")
|
||||
parser.add_argument('output_dir', help="""Output dir for prepared data""")
|
||||
"""
|
||||
)
|
||||
parser.add_argument("input_json", help="""Input json file of WenetSpeech""")
|
||||
parser.add_argument("output_dir", help="""Output dir for prepared data""")
|
||||
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
@ -39,58 +41,68 @@ def meta_analysis(input_json, output_dir):
|
||||
os.makedirs(output_dir)
|
||||
|
||||
try:
|
||||
with open(input_json, 'r') as injson:
|
||||
with open(input_json, "r") as injson:
|
||||
json_data = json.load(injson)
|
||||
except Exception:
|
||||
sys.exit(f'Failed to load input json file: {input_json}')
|
||||
sys.exit(f"Failed to load input json file: {input_json}")
|
||||
else:
|
||||
if json_data['audios'] is not None:
|
||||
with open(f'{output_dir}/text', 'w') as utt2text, \
|
||||
open(f'{output_dir}/segments', 'w') as segments, \
|
||||
open(f'{output_dir}/utt2dur', 'w') as utt2dur, \
|
||||
open(f'{output_dir}/wav.scp', 'w') as wavscp, \
|
||||
open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \
|
||||
open(f'{output_dir}/reco2dur', 'w') as reco2dur:
|
||||
for long_audio in json_data['audios']:
|
||||
if json_data["audios"] is not None:
|
||||
with open(f"{output_dir}/text", "w") as utt2text, open(
|
||||
f"{output_dir}/segments", "w"
|
||||
) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open(
|
||||
f"{output_dir}/wav.scp", "w"
|
||||
) as wavscp, open(
|
||||
f"{output_dir}/utt2subsets", "w"
|
||||
) as utt2subsets, open(
|
||||
f"{output_dir}/reco2dur", "w"
|
||||
) as reco2dur:
|
||||
for long_audio in json_data["audios"]:
|
||||
try:
|
||||
long_audio_path = os.path.realpath(
|
||||
os.path.join(input_dir, long_audio['path']))
|
||||
aid = long_audio['aid']
|
||||
segments_lists = long_audio['segments']
|
||||
duration = long_audio['duration']
|
||||
assert (os.path.exists(long_audio_path))
|
||||
os.path.join(input_dir, long_audio["path"])
|
||||
)
|
||||
aid = long_audio["aid"]
|
||||
segments_lists = long_audio["segments"]
|
||||
duration = long_audio["duration"]
|
||||
assert os.path.exists(long_audio_path)
|
||||
except AssertionError:
|
||||
print(f'''Warning: {aid} something is wrong,
|
||||
maybe AssertionError, skipped''')
|
||||
print(
|
||||
f"""Warning: {aid} something is wrong,
|
||||
maybe AssertionError, skipped"""
|
||||
)
|
||||
continue
|
||||
except Exception:
|
||||
print(f'''Warning: {aid} something is wrong, maybe the
|
||||
error path: {long_audio_path}, skipped''')
|
||||
print(
|
||||
f"""Warning: {aid} something is wrong, maybe the
|
||||
error path: {long_audio_path}, skipped"""
|
||||
)
|
||||
continue
|
||||
else:
|
||||
wavscp.write(f'{aid}\t{long_audio_path}\n')
|
||||
reco2dur.write(f'{aid}\t{duration}\n')
|
||||
wavscp.write(f"{aid}\t{long_audio_path}\n")
|
||||
reco2dur.write(f"{aid}\t{duration}\n")
|
||||
for segment_file in segments_lists:
|
||||
try:
|
||||
sid = segment_file['sid']
|
||||
start_time = segment_file['begin_time']
|
||||
end_time = segment_file['end_time']
|
||||
sid = segment_file["sid"]
|
||||
start_time = segment_file["begin_time"]
|
||||
end_time = segment_file["end_time"]
|
||||
dur = end_time - start_time
|
||||
text = segment_file['text']
|
||||
text = segment_file["text"]
|
||||
segment_subsets = segment_file["subsets"]
|
||||
except Exception:
|
||||
print(f'''Warning: {segment_file} something
|
||||
is wrong, skipped''')
|
||||
print(
|
||||
f"""Warning: {segment_file} something
|
||||
is wrong, skipped"""
|
||||
)
|
||||
continue
|
||||
else:
|
||||
utt2text.write(f'{sid}\t{text}\n')
|
||||
utt2text.write(f"{sid}\t{text}\n")
|
||||
segments.write(
|
||||
f'{sid}\t{aid}\t{start_time}\t{end_time}\n'
|
||||
f"{sid}\t{aid}\t{start_time}\t{end_time}\n"
|
||||
)
|
||||
utt2dur.write(f'{sid}\t{dur}\n')
|
||||
utt2dur.write(f"{sid}\t{dur}\n")
|
||||
segment_sub_names = " ".join(segment_subsets)
|
||||
utt2subsets.write(
|
||||
f'{sid}\t{segment_sub_names}\n')
|
||||
utt2subsets.write(f"{sid}\t{segment_sub_names}\n")
|
||||
|
||||
|
||||
def main():
|
||||
args = get_args()
|
||||
@ -98,5 +110,5 @@ def main():
|
||||
meta_analysis(args.input_json, args.output_dir)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
0
egs/wenetspeech/conformer/local/path.sh
Normal file
0
egs/wenetspeech/conformer/local/path.sh
Normal file
@ -16,14 +16,15 @@
|
||||
|
||||
# usage: python3 process_opus.py wav.scp segments output_wav.scp
|
||||
|
||||
from pydub import AudioSegment
|
||||
import sys
|
||||
import os
|
||||
import sys
|
||||
|
||||
from pydub import AudioSegment
|
||||
|
||||
|
||||
def read_file(wav_scp, segments):
|
||||
wav_scp_dict = {}
|
||||
with open(wav_scp, 'r', encoding='UTF-8') as fin:
|
||||
with open(wav_scp, "r", encoding="UTF-8") as fin:
|
||||
for line_str in fin:
|
||||
wav_id, path = line_str.strip().split()
|
||||
wav_scp_dict[wav_id] = path
|
||||
@ -32,7 +33,7 @@ def read_file(wav_scp, segments):
|
||||
seg_path_list = []
|
||||
start_time_list = []
|
||||
end_time_list = []
|
||||
with open(segments, 'r', encoding='UTF-8') as fin:
|
||||
with open(segments, "r", encoding="UTF-8") as fin:
|
||||
for line_str in fin:
|
||||
arr = line_str.strip().split()
|
||||
assert len(arr) == 4
|
||||
@ -44,30 +45,27 @@ def read_file(wav_scp, segments):
|
||||
|
||||
|
||||
# TODO(Qijie): Fix the process logic
|
||||
def output(output_wav_scp, utt_list, seg_path_list, start_time_list,
|
||||
end_time_list):
|
||||
def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
|
||||
num_utts = len(utt_list)
|
||||
step = int(num_utts * 0.01)
|
||||
with open(output_wav_scp, 'w', encoding='UTF-8') as fout:
|
||||
with open(output_wav_scp, "w", encoding="UTF-8") as fout:
|
||||
previous_wav_path = ""
|
||||
for i in range(num_utts):
|
||||
utt_id = utt_list[i]
|
||||
current_wav_path = seg_path_list[i]
|
||||
output_dir = (os.path.dirname(current_wav_path)) \
|
||||
.replace("audio", 'audio_seg')
|
||||
seg_wav_path = os.path.join(output_dir, utt_id + '.wav')
|
||||
|
||||
# if not os.path.exists(output_dir):
|
||||
# os.makedirs(output_dir)
|
||||
output_dir = (os.path.dirname(current_wav_path)).replace(
|
||||
"audio", "audio_seg"
|
||||
)
|
||||
seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
|
||||
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
if current_wav_path != previous_wav_path:
|
||||
source_wav = AudioSegment.from_file(current_wav_path)
|
||||
previous_wav_path = current_wav_path
|
||||
|
||||
start = int(start_time_list[i] * 1000)
|
||||
end = int(end_time_list[i] * 1000)
|
||||
target_audio = source_wav[start:end].set_frame_rate(16000) \
|
||||
.set_sample_width(2)
|
||||
target_audio = source_wav[start:end].set_frame_rate(16000)
|
||||
target_audio.export(seg_wav_path, format="wav")
|
||||
|
||||
fout.write("{} {}\n".format(utt_id, seg_wav_path))
|
||||
@ -80,11 +78,11 @@ def main():
|
||||
segments = sys.argv[2]
|
||||
output_wav_scp = sys.argv[3]
|
||||
|
||||
utt_list, seg_path_list, start_time_list, end_time_list \
|
||||
= read_file(wav_scp, segments)
|
||||
output(output_wav_scp, utt_list, seg_path_list, start_time_list,
|
||||
end_time_list)
|
||||
utt_list, seg_path_list, start_time_list, end_time_list = read_file(
|
||||
wav_scp, segments
|
||||
)
|
||||
output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
||||
24
egs/wenetspeech/conformer/local/text_normalize.pl
Executable file
24
egs/wenetspeech/conformer/local/text_normalize.pl
Executable file
@ -0,0 +1,24 @@
|
||||
#!/usr/bin/env perl
|
||||
use utf8;
|
||||
use open qw(:std :utf8);
|
||||
use warnings;
|
||||
|
||||
while (<STDIN>) {
|
||||
chomp;
|
||||
# remove non UTF-8 whitespace character
|
||||
if ($_ =~ / /) {$_ =~ s: ::g;}
|
||||
if ($_ =~ / /) {$_ =~ s: ::g;}
|
||||
# upper letters: convert the line to upper case so that the [A-Z] rules below
# also apply to lower-case input. This has to be an assignment; "$_ =~ uc $_"
# merely pattern-matches the line against its upper-cased copy and changes nothing.
|
||||
if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
|
||||
# add "_" before and after each English word
|
||||
if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
|
||||
if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
|
||||
if ($_ =~ m/([A-Z]+)(\p{Han}+)/) {$_ =~ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;}
|
||||
if ($_ =~ m/(\p{Han}+)([A-Z]+)/) {$_ =~ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;}
|
||||
# remove UTF-8 whitespace characters
|
||||
if ($_ =~ /\s+/) {$_ =~ s:\s+::g;}
|
||||
# replace "_" with a normal whitespace
|
||||
if ($_ =~ /\_/) {$_ =~ s:\_: :g;}
|
||||
|
||||
print "$_\n";
|
||||
}
|
||||
@ -24,7 +24,7 @@ stage=1
|
||||
prefix=
|
||||
train_subset=L
|
||||
|
||||
. ./utils/parse_options.sh || exit 1;
|
||||
. utils/parse_options.sh || exit 1;
|
||||
|
||||
filter_by_id () {
|
||||
idlist=$1
|
||||
|
||||
@ -41,6 +41,7 @@ set -e
|
||||
set -u
|
||||
set -o pipefail
|
||||
|
||||
set=L
|
||||
train_set=train_l
|
||||
valid_set=dev
|
||||
test_sets="dev test_net test_meeting"
|
||||
@ -71,15 +72,15 @@ fi
|
||||
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
|
||||
echo "stage 0: Data preparation"
|
||||
# Data preparation
|
||||
local/wenetspeech_data_prep.sh $raw_data $feats_dir
|
||||
mkdir $feats_dir/data
|
||||
mv $feats_dir/$train_set $feats_dir/data/$train_set
|
||||
for x in $test_sets; do
|
||||
mv $feats_dir/$x $feats_dir/data/
|
||||
done
|
||||
# Pass the option name and its value as two separate words; quoting them
# together hands local/data.sh a single argument "--set <value>", which the
# option parser it sources (utils/parse_options.sh) is not expected to accept.
local/data.sh --set "${set}"
|
||||
# mkdir $feats_dir/data
|
||||
# mv $feats_dir/$train_set $feats_dir/data/$train_set
|
||||
# for x in $test_sets; do
|
||||
# mv $feats_dir/$x $feats_dir/data/
|
||||
# done
|
||||
fi
|
||||
|
||||
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
echo "stage 1: Feature and CMVN Generation"
|
||||
utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
|
||||
fi
|
||||
#if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
|
||||
# echo "stage 1: Feature and CMVN Generation"
|
||||
# utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
|
||||
#fi
|
||||
|
||||
Loading…
Reference in New Issue
Block a user