From 883bd3fedaedf23044848c679a6a5340b61f78c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=98=89=E6=B8=8A?= Date: Thu, 25 May 2023 14:43:54 +0800 Subject: [PATCH] update repo --- .../conformer/local/extract_meta.py | 102 ++++++++++++++++++ .../conformer/local/process_opus.py | 90 ++++++++++++++++ .../conformer/local/wenetspeech_data_prep.sh | 4 +- 3 files changed, 194 insertions(+), 2 deletions(-) create mode 100755 egs/wenetspeech/conformer/local/extract_meta.py create mode 100755 egs/wenetspeech/conformer/local/process_opus.py diff --git a/egs/wenetspeech/conformer/local/extract_meta.py b/egs/wenetspeech/conformer/local/extract_meta.py new file mode 100755 index 000000000..ce2871d0b --- /dev/null +++ b/egs/wenetspeech/conformer/local/extract_meta.py @@ -0,0 +1,102 @@ +# Copyright 2021 Xiaomi Corporation (Author: Yongqing Wang) +# Mobvoi Inc(Author: Di Wu, Binbin Zhang) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import sys +import os +import argparse +import json + + +def get_args(): + parser = argparse.ArgumentParser(description=""" + This script is used to process raw json dataset of WenetSpeech, + where the long wav is splitinto segments and + data of wenet format is generated. + """) + parser.add_argument('input_json', help="""Input json file of WenetSpeech""") + parser.add_argument('output_dir', help="""Output dir for prepared data""") + + args = parser.parse_args() + return args + + +def meta_analysis(input_json, output_dir): + input_dir = os.path.dirname(input_json) + + if not os.path.exists(output_dir): + os.makedirs(output_dir) + + try: + with open(input_json, 'r') as injson: + json_data = json.load(injson) + except Exception: + sys.exit(f'Failed to load input json file: {input_json}') + else: + if json_data['audios'] is not None: + with open(f'{output_dir}/text', 'w') as utt2text, \ + open(f'{output_dir}/segments', 'w') as segments, \ + open(f'{output_dir}/utt2dur', 'w') as utt2dur, \ + open(f'{output_dir}/wav.scp', 'w') as wavscp, \ + open(f'{output_dir}/utt2subsets', 'w') as utt2subsets, \ + open(f'{output_dir}/reco2dur', 'w') as reco2dur: + for long_audio in json_data['audios']: + try: + long_audio_path = os.path.realpath( + os.path.join(input_dir, long_audio['path'])) + aid = long_audio['aid'] + segments_lists = long_audio['segments'] + duration = long_audio['duration'] + assert (os.path.exists(long_audio_path)) + except AssertionError: + print(f'''Warning: {aid} something is wrong, + maybe AssertionError, skipped''') + continue + except Exception: + print(f'''Warning: {aid} something is wrong, maybe the + error path: {long_audio_path}, skipped''') + continue + else: + wavscp.write(f'{aid}\t{long_audio_path}\n') + reco2dur.write(f'{aid}\t{duration}\n') + for segment_file in segments_lists: + try: + sid = segment_file['sid'] + start_time = segment_file['begin_time'] + end_time = segment_file['end_time'] + dur = end_time - start_time + text = segment_file['text'] + segment_subsets = segment_file["subsets"] + except Exception: + print(f'''Warning: {segment_file} something + is wrong, skipped''') + continue + else: + utt2text.write(f'{sid}\t{text}\n') + segments.write( + f'{sid}\t{aid}\t{start_time}\t{end_time}\n' + ) + utt2dur.write(f'{sid}\t{dur}\n') + segment_sub_names = " ".join(segment_subsets) + utt2subsets.write( + f'{sid}\t{segment_sub_names}\n') + +def main(): + args = get_args() + + meta_analysis(args.input_json, args.output_dir) + + +if __name__ == '__main__': + main() diff --git a/egs/wenetspeech/conformer/local/process_opus.py b/egs/wenetspeech/conformer/local/process_opus.py new file mode 100755 index 000000000..9f71eb1a6 --- /dev/null +++ b/egs/wenetspeech/conformer/local/process_opus.py @@ -0,0 +1,90 @@ +# Copyright 2021 NPU, ASLP Group (Author: Qijie Shao) + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# process_opus.py: segmentation and downsampling of opus audio + +# usage: python3 process_opus.py wav.scp segments output_wav.scp + +from pydub import AudioSegment +import sys +import os + + +def read_file(wav_scp, segments): + wav_scp_dict = {} + with open(wav_scp, 'r', encoding='UTF-8') as fin: + for line_str in fin: + wav_id, path = line_str.strip().split() + wav_scp_dict[wav_id] = path + + utt_list = [] + seg_path_list = [] + start_time_list = [] + end_time_list = [] + with open(segments, 'r', encoding='UTF-8') as fin: + for line_str in fin: + arr = line_str.strip().split() + assert len(arr) == 4 + utt_list.append(arr[0]) + seg_path_list.append(wav_scp_dict[arr[1]]) + start_time_list.append(float(arr[2])) + end_time_list.append(float(arr[3])) + return utt_list, seg_path_list, start_time_list, end_time_list + + +# TODO(Qijie): Fix the process logic +def output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list): + num_utts = len(utt_list) + step = int(num_utts * 0.01) + with open(output_wav_scp, 'w', encoding='UTF-8') as fout: + previous_wav_path = "" + for i in range(num_utts): + utt_id = utt_list[i] + current_wav_path = seg_path_list[i] + output_dir = (os.path.dirname(current_wav_path)) \ + .replace("audio", 'audio_seg') + seg_wav_path = os.path.join(output_dir, utt_id + '.wav') + + # if not os.path.exists(output_dir): + # os.makedirs(output_dir) + + if current_wav_path != previous_wav_path: + source_wav = AudioSegment.from_file(current_wav_path) + previous_wav_path = current_wav_path + + start = int(start_time_list[i] * 1000) + end = int(end_time_list[i] * 1000) + target_audio = source_wav[start:end].set_frame_rate(16000) \ + .set_sample_width(2) + target_audio.export(seg_wav_path, format="wav") + + fout.write("{} {}\n".format(utt_id, seg_wav_path)) + if i % step == 0: + print("seg wav finished: {}%".format(int(i / step))) + + +def main(): + wav_scp = sys.argv[1] + segments = sys.argv[2] + output_wav_scp = sys.argv[3] + + utt_list, seg_path_list, start_time_list, end_time_list \ + = read_file(wav_scp, segments) + output(output_wav_scp, utt_list, seg_path_list, start_time_list, + end_time_list) + + +if __name__ == '__main__': + main() diff --git a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh index 0bfd96fdf..0fd3b5bc3 100755 --- a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh +++ b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh @@ -24,7 +24,7 @@ stage=1 prefix= train_subset=L -. ./utils/parse_options.sh || exit 1; +. ./tools/parse_options.sh || exit 1; filter_by_id () { idlist=$1 @@ -132,4 +132,4 @@ if [ $stage -le 2 ]; then done fi -echo "$0: Done" \ No newline at end of file +echo "$0: Done"