FunASR/funasr/datasets/ms_dataset.py
2023-01-16 18:46:40 +08:00

34 lines
1.3 KiB
Python

import os
class MsDataset(object):
    """Load a speech dataset from a local directory or from ModelScope.

    A local dataset is a directory with one sub-directory per split, each
    holding a ``wav.scp`` file and a ``text`` file that are paired line by
    line.
    """

    @classmethod
    def load_core(cls, data_dir, data_set):
        """Read one split and return its samples.

        Args:
            data_dir: Root directory of the dataset.
            data_set: Name of the split sub-directory under ``data_dir``.

        Returns:
            A list of dicts, one per paired line, with the keys
            ``"Audio:FILE"`` (last whitespace-separated field of the
            ``wav.scp`` line) and ``"Text:LABEL"`` (all fields after the
            first on the ``text`` line, joined by single spaces).

        Raises:
            OSError: If ``wav.scp`` or ``text`` cannot be opened.
        """
        wav_file = os.path.join(data_dir, data_set, "wav.scp")
        text_file = os.path.join(data_dir, data_set, "text")
        # Read explicitly as UTF-8: the locale default is not UTF-8 on every
        # platform, and transcripts commonly contain non-ASCII characters.
        with open(wav_file, encoding="utf-8") as f:
            wav_lines = f.readlines()
        with open(text_file, encoding="utf-8") as f:
            text_lines = f.readlines()

        data_list = []
        # Lines are paired purely by position; zip() stops at the shorter
        # file, so surplus lines in the longer one are ignored.
        for wav_line, text_line in zip(wav_lines, text_lines):
            wav_fields = wav_line.split()
            if not wav_fields:
                # A blank wav.scp line (e.g. a trailing empty line) carries
                # no audio path; skip the pair instead of raising IndexError.
                continue
            data_list.append({
                "Audio:FILE": wav_fields[-1],
                "Text:LABEL": " ".join(text_line.split()[1:]),
            })
        return data_list

    @classmethod
    def load(cls, dataset_name, namespace="speech_asr", train_set="train", dev_set="validation"):
        """Load a dataset from disk when the path exists, else from ModelScope.

        Args:
            dataset_name: Path of a local dataset directory, or the name of
                a dataset to request from ModelScope.
            namespace: Namespace passed to ModelScope; unused for local data.
            train_set: Sub-directory holding the training split; only used
                for local data.
            dev_set: Sub-directory holding the validation split; only used
                for local data.

        Returns:
            For local data, a dict with the keys ``"train"`` and
            ``"validation"`` (lists produced by ``load_core``) and
            ``"raw_data_dir"`` (the dataset directory). Otherwise, whatever
            ``modelscope.msdatasets.MsDataset.load`` returns.
        """
        if os.path.exists(dataset_name):
            data_dir = dataset_name
            return {
                "train": cls.load_core(data_dir, train_set),
                "validation": cls.load_core(data_dir, dev_set),
                "raw_data_dir": data_dir,
            }

        # Imported lazily so modelscope is only required for remote datasets;
        # aliased so it does not shadow this class inside the method.
        from modelscope.msdatasets import MsDataset as _RemoteMsDataset
        return _RemoteMsDataset.load(dataset_name=dataset_name, namespace=namespace)