From a4de8b2a0a69ba42c58d6bacb9c9108539a1e280 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=8C=97=E5=BF=B5?= Date: Tue, 10 Oct 2023 15:49:04 +0800 Subject: [PATCH] update asr postprocess_utils --- funasr/bin/asr_inference_launch.py | 10 ++++-- funasr/utils/postprocess_utils.py | 53 +++++++++++++++++++++++++++++- 2 files changed, 59 insertions(+), 4 deletions(-) diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py index 50b988602..369598076 100644 --- a/funasr/bin/asr_inference_launch.py +++ b/funasr/bin/asr_inference_launch.py @@ -488,6 +488,7 @@ def inference_paraformer_vad_punc( ): ncpu = kwargs.get("ncpu", 1) torch.set_num_threads(ncpu) + language = kwargs.get("model_lang", None) if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") @@ -694,10 +695,13 @@ def inference_paraformer_vad_punc( text, token, token_int = result[0], result[1], result[2] time_stamp = result[4] if len(result[4]) > 0 else None - if use_timestamp and time_stamp is not None and len(time_stamp): - postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) + if language == "en-bpe": + postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token) else: - postprocessed_result = postprocess_utils.sentence_postprocess(token) + if use_timestamp and time_stamp is not None and len(time_stamp): + postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) + else: + postprocessed_result = postprocess_utils.sentence_postprocess(token) text_postprocessed = "" time_stamp_postprocessed = "" text_postprocessed_punc = postprocessed_result diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py index f4efea66f..efba7551a 100644 --- a/funasr/utils/postprocess_utils.py +++ b/funasr/utils/postprocess_utils.py @@ -242,4 +242,55 @@ def sentence_postprocess(words: List[Any], time_stamp: List[List] = None): if ch != ' ': real_word_lists.append(ch) sentence = 
def sentence_postprocess_sentencepiece(words):
    """Merge SentencePiece-style tokens into words and a sentence.

    A token containing the marker ``'\\u2581'`` starts a new word; any other
    token is appended to the word currently being built.

    Args:
        words: Iterable of tokens. Each token is either ``str`` or an object
            with a ``decode('utf-8')`` method (e.g. ``bytes``).

    Returns:
        A ``(sentence, real_word_lists)`` tuple. ``sentence`` is the merged
        words joined by single spaces. ``real_word_lists`` is the list of
        merged words, with the pronoun forms ``i``, ``i'm``, ``i've`` and
        ``i'll`` capitalised.
    """
    boundary = '\u2581'
    pronoun_fixes = {
        "i": "I",
        "i'm": "I'm",
        "i've": "I've",
        "i'll": "I'll",
    }

    # Wash the token list: decode non-str tokens and drop filtered ones.
    pieces = []
    for token in words:
        piece = token if isinstance(token, str) else token.decode('utf-8')
        # NOTE(review): the original filter was a list of four identical empty
        # strings, which looks like special-token markers whose text was lost;
        # confirm the intended tokens against the upstream file. Only empty
        # tokens are dropped here, matching the visible behaviour.
        if piece == '':
            continue
        pieces.append(piece)

    merged = []
    current = ''
    for piece in pieces:
        if boundary in piece:
            # Flush the pending word only if it has content, so a bare marker
            # token never produces an empty word or a stray space.
            if current:
                merged.append(current)
            current = piece.replace(boundary, '')
        else:
            current += piece
    if current:
        merged.append(current)

    real_word_lists = [pronoun_fixes.get(word, word) for word in merged]
    # NOTE(review): the sentence is built from the un-capitalised words, as in
    # the original; only the returned word list carries the pronoun fixes.
    sentence = ' '.join(merged)
    return sentence, real_word_lists