fix text postprocess bug

2025-09-15 14:48:36 +08:00 · 2023-03-01 11:14:09 +08:00 · 2023-03-01 11:14:09 +08:00 · e9ea65679a
commit e9ea65679a
parent b6a1c6c1e6
1 changed files with 6 additions and 2 deletions
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -6,7 +6,7 @@ from typing import Any, List, Union


 def isChinese(ch: str):
-    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039' or ch == '@':
        return True
    return False

@@ -17,6 +17,8 @@ def isAllChinese(word: Union[List[Any], str]):
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
+        cur = cur.replace('<unk>', '')
+        cur = cur.replace('<OOV>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
@@ -34,6 +36,8 @@ def isAllAlpha(word: Union[List[Any], str]):
        cur = i.replace(' ', '')
        cur = cur.replace('</s>', '')
        cur = cur.replace('<s>', '')
+        cur = cur.replace('<unk>', '')
+        cur = cur.replace('<OOV>', '')
        word_lists.append(cur)

    if len(word_lists) == 0:
@@ -144,7 +148,7 @@ def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
        else:
            word = i.decode('utf-8')

-        if word in ['<s>', '</s>', '<unk>']:
+        if word in ['<s>', '</s>', '<unk>', '<OOV>']:
            continue
        else:
            middle_lists.append(word)