From 167bab54bbcc0e2b0143e0c2fedce06ee8326ad5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=98=89=E6=B8=8A?= Date: Fri, 26 May 2023 11:51:07 +0800 Subject: [PATCH] update repo --- egs/wenetspeech/conformer/conf/train_asr_conformer.yaml | 2 +- funasr/datasets/large_datasets/dataset.py | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml index 9842fa4f1..a9658b8eb 100644 --- a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml +++ b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml @@ -90,7 +90,7 @@ specaug_conf: dataset_conf: data_names: speech,text - data_types: sound,text + data_types: sound,text_nospace shuffle: True shuffle_conf: shuffle_size: 2048 diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py index 5df61fdb0..68b63e137 100644 --- a/funasr/datasets/large_datasets/dataset.py +++ b/funasr/datasets/large_datasets/dataset.py @@ -148,6 +148,12 @@ class AudioDataset(IterableDataset): if "key" not in sample_dict: sample_dict["key"] = segs[0] sample_dict['hw_tag'] = 1 + elif data_type == "text_nospace": + text = item + segs = text.strip().split(maxsplit=1) + sample_dict[data_name] = [x for x in segs[1]] + if "key" not in sample_dict: + sample_dict["key"] = segs[0] else: text = item segs = text.strip().split()