sensevoice2jsonl.py punctuation matching fix (#2533)

* fix sensevoice2jsonl.py punctuation check

* fix sensevoice2jsonl.py punc check
This commit is contained in:
yuGAN6 2025-05-28 10:33:26 +08:00 committed by GitHub
parent ab2148ec18
commit 3445cd9652
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -4,6 +4,7 @@ import torch
import logging
import hydra
import re
import string
from omegaconf import DictConfig, OmegaConf
import concurrent.futures
import librosa
@ -119,8 +120,11 @@ def gen_jsonl_from_wav_text_list(
dist.barrier()
def contains_punctuation(s):
    """Return True if *s* contains at least one punctuation character.

    Both ASCII punctuation (``string.punctuation``) and the punctuation
    commonly found in Chinese/Japanese text are recognised.

    Args:
        s: The text to inspect.

    Returns:
        bool: True when any character of ``s`` is a known punctuation
        mark, False otherwise (including for the empty string).
    """
    # A plain membership test is used instead of a regex character class:
    # inside a class, characters such as ']' and '\\' need careful escaping,
    # and an unescaped ']' silently closes the class early so that most
    # punctuation is never matched.
    punctuations = (
        string.punctuation
        # CJK punctuation, brackets, middle dot, ellipsis and dashes.
        + '。、【】《》〈〉「」『』〔〕·…—–'
        # Full-width forms of , ; : ? ! ( ) [ ] { } ~ written as escapes so
        # they cannot be folded back to their ASCII look-alikes by tooling.
        + '\uff0c\uff1b\uff1a\uff1f\uff01\uff08\uff09\uff3b\uff3d\uff5b\uff5d\uff5e'
        # Curly double and single quotation marks.
        + '\u201c\u201d\u2018\u2019'
    )
    return any(char in punctuations for char in s)
def parse_context_length(data_list: list, data_type: str, id=0):
pbar = tqdm(total=len(data_list), dynamic_ncols=True)