mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
sensevoice2jsonl.py punctuation matching fix (#2533)
* fix sensevoice2jsonl.py punctuation check * fix sensevoice2jsonl.py punc check
This commit is contained in:
parent
ab2148ec18
commit
3445cd9652
@ -4,6 +4,7 @@ import torch
|
||||
import logging
|
||||
import hydra
|
||||
import re
|
||||
import string
|
||||
from omegaconf import DictConfig, OmegaConf
|
||||
import concurrent.futures
|
||||
import librosa
|
||||
@ -119,8 +120,11 @@ def gen_jsonl_from_wav_text_list(
|
||||
dist.barrier()
|
||||
|
||||
# Punctuation outside ASCII that must also be recognised: CJK marks plus the
# full-width and typographic counterparts of common ASCII punctuation.
# Escapes are used for the look-alike characters so they cannot be silently
# folded back to their ASCII forms by an editor or a text normaliser.
_EXTRA_PUNCTUATION = (
    "。、【】《》〈〉「」『』〔〕·…—–"
    "\uff0c\uff1b\uff1a\uff1f\uff01"  # full-width , ; : ? !
    "\uff08\uff09\uff3b\uff3d\uff5b\uff5d\uff5e"  # full-width ( ) [ ] { } ~
    "\u201c\u201d\u2018\u2019"  # curly double and single quotes
)

# Built once at import time; a frozenset gives O(1) membership per character.
_PUNCTUATION_CHARS = frozenset(string.punctuation + _EXTRA_PUNCTUATION)


def contains_punctuation(s):
    """Return True if *s* contains at least one punctuation character.

    A character counts as punctuation when it is in ``string.punctuation``
    (the ASCII set) or in ``_EXTRA_PUNCTUATION`` (CJK, full-width and
    typographic marks). Whitespace, letters and digits never count.

    Args:
        s: The text to inspect. Any iterable of single-character strings
            works; an empty string yields ``False``.

    Returns:
        ``True`` when any character of *s* is punctuation, else ``False``.
    """
    # A plain set test replaces the earlier regex, whose character class was
    # closed prematurely by ``\\]`` and therefore missed ordinary punctuation.
    return not _PUNCTUATION_CHARS.isdisjoint(s)
|
||||
|
||||
def parse_context_length(data_list: list, data_type: str, id=0):
|
||||
pbar = tqdm(total=len(data_list), dynamic_ncols=True)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user