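update repo: add `text_nospace` data type (splits the transcript into individual characters) and use it in the wenetspeech conformer config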

2025-09-15 14:48:36 +08:00 · 2023-05-26 11:51:07 +08:00 · 2023-05-26 11:51:07 +08:00 · 167bab54bb
commit 167bab54bb
parent 3a15e5392b
2 changed files with 7 additions and 1 deletion
--- a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
+++ b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
@@ -90,7 +90,7 @@ specaug_conf:

 dataset_conf:
    data_names: speech,text
-    data_types: sound,text
+    data_types: sound,text_nospace
    shuffle: True
    shuffle_conf:
        shuffle_size: 2048
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -148,6 +148,12 @@ class AudioDataset(IterableDataset):
                        if "key" not in sample_dict:
                            sample_dict["key"] = segs[0]
                        sample_dict['hw_tag'] = 1
+                    elif data_type == "text_nospace":
+                        text = item
+                        segs = text.strip().split(maxsplit=1)
+                        sample_dict[data_name] = [x for x in segs[1]]
+                        if "key" not in sample_dict:
+                            sample_dict["key"] = segs[0]
                    else:
                        text = item
                        segs = text.strip().split()