sensevoice2jsonl.py punctuation matching fix (#2533)

* fix sensevoice2jsonl.py punctuation check

* fix sensevoice2jsonl.py punc check
This commit is contained in:
yuGAN6 2025-05-28 10:33:26 +08:00 committed by GitHub
parent ab2148ec18
commit 3445cd9652
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -4,6 +4,7 @@ import torch
import logging import logging
import hydra import hydra
import re import re
import string
from omegaconf import DictConfig, OmegaConf from omegaconf import DictConfig, OmegaConf
import concurrent.futures import concurrent.futures
import librosa import librosa
@@ -119,8 +120,11 @@ def gen_jsonl_from_wav_text_list(
dist.barrier() dist.barrier()
def contains_punctuation(s): def contains_punctuation(s):
pattern = r'[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]' punctuations = (
return re.search(pattern, s) is not None string.punctuation +
',。、;:?!""''()【】《》〈〉「」『』〔〕[]{}~·…—–'
)
return any(char in punctuations for char in s)
def parse_context_length(data_list: list, data_type: str, id=0): def parse_context_length(data_list: list, data_type: str, id=0):
pbar = tqdm(total=len(data_list), dynamic_ncols=True) pbar = tqdm(total=len(data_list), dynamic_ncols=True)