mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
Fix data dir filter bug: split text lines on any whitespace when matching wav.scp against text
This commit is contained in:
parent
2432fc9c39
commit
b6a1c6c1e6
@@ -305,11 +305,11 @@ def filter_wav_text(data_dir, dataset):
|
||||
wav_dict[sample_name] = wav_path
|
||||
text_dict = {}
|
||||
for line in text_lines:
|
||||
- parts = line.strip().split(" ", 1)
|
||||
+ parts = line.strip().split()
|
||||
if len(parts) < 2:
|
||||
continue
|
||||
- sample_name, txt = parts
|
||||
- text_dict[sample_name] = txt
|
||||
+ sample_name = parts[0]
|
||||
+ text_dict[sample_name] = " ".join(parts[1:])
|
||||
filter_count = 0
|
||||
with open(wav_file, "w") as f_wav, open(text_file, "w") as f_text:
|
||||
for sample_name, wav_path in wav_dict.items():
|
||||
@@ -318,4 +318,4 @@ def filter_wav_text(data_dir, dataset):
|
||||
f_text.write(sample_name + " " + text_dict[sample_name] + "\n")
|
||||
else:
|
||||
filter_count += 1
|
||||
- print("{}/{} samples in {} are filtered because of the mismatch between wav.scp and text".format(len(wav_lines), filter_count, dataset))
|
||||
+ print("{}/{} samples in {} are filtered because of the mismatch between wav.scp and text".format(len(wav_lines), filter_count, dataset))
|
||||
|
||||
Loading…
Reference in New Issue
Block a user