Add handling for wav/text mismatches

This commit is contained in:
speech_asr 2023-02-14 14:45:39 +08:00
parent f79b6decc9
commit 66a8235fbf

View File

@@ -298,11 +298,17 @@ def filter_wav_text(data_dir, dataset):
os.rename(text_file, "{}.bak".format(text_file))
wav_dict = {}
for line in wav_lines:
sample_name, wav_path = line.strip().split()
parts = line.strip().split()
if len(parts) < 2:
continue
sample_name, wav_path = parts
wav_dict[sample_name] = wav_path
text_dict = {}
for line in text_lines:
sample_name, txt = line.strip().split(" ", 1)
parts = line.strip().split(" ", 1)
if len(parts) < 2:
continue
sample_name, txt = parts
text_dict[sample_name] = txt
filter_count = 0
with open(wav_file) as f_wav, open(text_file) as f_text: