update asr postprocess_utils

This commit is contained in:
北念 2023-10-10 15:49:04 +08:00
parent 6161129cce
commit a4de8b2a0a
2 changed files with 59 additions and 4 deletions

View File

@ -488,6 +488,7 @@ def inference_paraformer_vad_punc(
):
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
language = kwargs.get("model_lang", None)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
@ -694,10 +695,13 @@ def inference_paraformer_vad_punc(
text, token, token_int = result[0], result[1], result[2]
time_stamp = result[4] if len(result[4]) > 0 else None
if use_timestamp and time_stamp is not None and len(time_stamp):
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
if language == "en-bpe":
postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
else:
postprocessed_result = postprocess_utils.sentence_postprocess(token)
if use_timestamp and time_stamp is not None and len(time_stamp):
postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
else:
postprocessed_result = postprocess_utils.sentence_postprocess(token)
text_postprocessed = ""
time_stamp_postprocessed = ""
text_postprocessed_punc = postprocessed_result

View File

@ -242,4 +242,55 @@ def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
if ch != ' ':
real_word_lists.append(ch)
sentence = ''.join(word_lists).strip()
return sentence, real_word_lists
return sentence, real_word_lists
def sentence_postprocess_sentencepiece(words):
    """Merge SentencePiece tokens into a sentence and a list of words.

    Tokens that are not ``str`` are decoded with ``decode('utf-8')``; the
    special symbols ``<s>``, ``</s>``, ``<unk>`` and ``<OOV>`` are dropped.
    A remaining piece containing the marker U+2581 starts a new word (the
    marker is removed); any other piece is appended to the current word.

    Args:
        words: Iterable of tokens, each a ``str`` or an object exposing
            ``decode('utf-8')`` (e.g. ``bytes``).

    Returns:
        A ``(sentence, real_word_lists)`` tuple.

        ``sentence`` is the concatenation of the words, with one space
        inserted before every word-start piece that is not the very first
        piece. It is NOT affected by the capitalization below.

        ``real_word_lists`` holds the non-empty words in order, with the
        exact words ``i``, ``i'm``, ``i've`` and ``i'll`` rewritten to
        ``I``, ``I'm``, ``I've`` and ``I'll``.
    """
    special_tokens = {'<s>', '</s>', '<unk>', '<OOV>'}
    capitalized = {"i": "I", "i'm": "I'm", "i've": "I've", "i'll": "I'll"}
    marker = '\u2581'

    # Normalize every token to str and discard special symbols.
    pieces = []
    for token in words:
        piece = token if isinstance(token, str) else token.decode('utf-8')
        if piece not in special_tokens:
            pieces.append(piece)

    # Glue pieces into words; a marker-bearing piece closes the previous word.
    word_lists = []
    word_item = ''
    for index, piece in enumerate(pieces):
        if marker in piece:
            if index != 0:
                word_lists.append(word_item)
                word_lists.append(' ')
            word_item = piece.replace(marker, '')
        else:
            word_item += piece
    # Flushing unconditionally is harmless for the sentence: an empty string
    # contributes nothing to the join below.
    word_lists.append(word_item)

    # Skip separators and empty words. Empty words arise from empty input or
    # from a bare marker piece, and are not real words.
    real_word_lists = [
        capitalized.get(word, word)
        for word in word_lists
        if word and word != ' '
    ]
    sentence = ''.join(word_lists)
    return sentence, real_word_lists