def contains_punctuation(s: str) -> bool:
    """Return True if ``s`` contains at least one punctuation character.

    A character counts as punctuation when it is either

    * an ASCII punctuation character (``string.punctuation``), or
    * one of the CJK / full-width / typographic marks listed below.

    Args:
        s: Text to inspect. An empty string yields ``False``.

    Returns:
        ``True`` as soon as any character of ``s`` is a known punctuation
        mark, ``False`` otherwise.
    """
    # Local import so the function does not depend on a file-level
    # ``import string`` being present.
    import string

    # ASCII forms of , ; : ? ! quotes and brackets are already covered by
    # ``string.punctuation``. The full-width and curly variants are written
    # as escapes so they cannot be silently folded to ASCII by an editor or
    # a text-normalization step.
    extra_marks = (
        "\u3002\u3001"                  # ideographic full stop, comma
        "\uff0c\uff1b\uff1a\uff1f\uff01"  # full-width , ; : ? !
        "\u201c\u201d\u2018\u2019"      # curly double / single quotes
        "\uff08\uff09"                  # full-width parentheses
        "\u3010\u3011\u300a\u300b"      # lenticular, double angle brackets
        "\u3008\u3009\u300c\u300d"      # angle brackets, corner brackets
        "\u300e\u300f\u3014\u3015"      # white corner, tortoise-shell brackets
        "\uff3b\uff3d\uff5b\uff5d"      # full-width [ ] { }
        "\uff5e\u00b7\u2026"            # full-width tilde, middle dot, ellipsis
        "\u2014\u2013"                  # em dash, en dash
    )
    # A frozenset gives O(1) membership per character instead of a linear
    # scan over the punctuation string.
    punctuation_chars = frozenset(string.punctuation + extra_marks)
    return any(char in punctuation_chars for char in s)