mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
update asr postprocess_utils
This commit is contained in:
parent
6161129cce
commit
a4de8b2a0a
@ -488,6 +488,7 @@ def inference_paraformer_vad_punc(
|
||||
):
|
||||
ncpu = kwargs.get("ncpu", 1)
|
||||
torch.set_num_threads(ncpu)
|
||||
language = kwargs.get("model_lang", None)
|
||||
|
||||
if word_lm_train_config is not None:
|
||||
raise NotImplementedError("Word LM is not implemented")
|
||||
@ -694,10 +695,13 @@ def inference_paraformer_vad_punc(
|
||||
text, token, token_int = result[0], result[1], result[2]
|
||||
time_stamp = result[4] if len(result[4]) > 0 else None
|
||||
|
||||
if use_timestamp and time_stamp is not None and len(time_stamp):
|
||||
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
|
||||
if language == "en-bpe":
|
||||
postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
|
||||
else:
|
||||
postprocessed_result = postprocess_utils.sentence_postprocess(token)
|
||||
if use_timestamp and time_stamp is not None and len(time_stamp):
|
||||
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
|
||||
else:
|
||||
postprocessed_result = postprocess_utils.sentence_postprocess(token)
|
||||
text_postprocessed = ""
|
||||
time_stamp_postprocessed = ""
|
||||
text_postprocessed_punc = postprocessed_result
|
||||
|
||||
@ -242,4 +242,55 @@ def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
|
||||
if ch != ' ':
|
||||
real_word_lists.append(ch)
|
||||
sentence = ''.join(word_lists).strip()
|
||||
return sentence, real_word_lists
|
||||
return sentence, real_word_lists
|
||||
|
||||
def sentence_postprocess_sentencepiece(words):
    """Rebuild a space-separated sentence from SentencePiece tokens.

    Tokens may be ``str`` or UTF-8 encoded ``bytes``.  The special tokens
    ``<s>``, ``</s>``, ``<unk>`` and ``<OOV>`` are dropped.  A token that
    contains the marker U+2581 starts a new word (the marker itself is
    removed); any other token is appended to the word being built.

    Args:
        words: iterable of SentencePiece tokens (``str`` or ``bytes``).

    Returns:
        A ``(sentence, real_word_lists)`` tuple.  ``sentence`` is the merged
        words joined by single spaces, with the tokens' original casing.
        ``real_word_lists`` is the list of merged words in which the
        standalone forms ``i``, ``i'm``, ``i've`` and ``i'll`` are
        capitalized.  Empty words are never emitted, so empty input yields
        ``('', [])``.
    """
    special_tokens = ('<s>', '</s>', '<unk>', '<OOV>')
    pronoun_fixes = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}

    # wash words lists: decode bytes tokens and drop the special symbols
    pieces = []
    for token in words:
        piece = token if isinstance(token, str) else token.decode('utf-8')
        if piece not in special_tokens:
            pieces.append(piece)

    # Merge sub-word pieces into whole words; U+2581 marks a word start.
    merged_words = []
    current = ''
    for piece in pieces:
        if '\u2581' in piece:
            # Flush the previous word only if it has content, so a bare
            # marker piece never produces an empty word or a stray space.
            if current:
                merged_words.append(current)
            current = piece.replace('\u2581', '')
        else:
            current += piece
    if current:
        merged_words.append(current)

    # Only the word list gets the first-person capitalization; the sentence
    # string keeps the casing produced by the tokens.
    real_word_lists = [pronoun_fixes.get(word, word) for word in merged_words]
    sentence = ' '.join(merged_words)
    return sentence, real_word_lists
|
||||
Loading…
Reference in New Issue
Block a user