# Mirror of https://github.com/modelscope/FunASR
# Synced 2025-09-15 14:48:36 +08:00
# 37 lines, 1.0 KiB, Python
|
|
|
|
def split_to_mini_sentence(words: list, word_limit: int = 20):
    """Split ``words`` into consecutive chunks of at most ``word_limit`` items.

    Args:
        words: Sequence of tokens to split. It is only sliced, never mutated.
        word_limit: Maximum number of items per chunk. Must be greater than 1.

    Returns:
        A list of chunks in input order. When ``words`` holds no more than
        ``word_limit`` items the result is ``[words]`` (the very same object,
        not a copy), so an empty input yields ``[[]]``. Otherwise every chunk
        is a slice of exactly ``word_limit`` items, except possibly the last
        one, which holds the remainder.

    Raises:
        AssertionError: If ``word_limit`` is not greater than 1 (only when
            assertions are enabled).
    """
    assert word_limit > 1

    # Short input: hand back the original sequence wrapped in a list.
    if len(words) <= word_limit:
        return [words]

    # Stepping by ``word_limit`` produces all full chunks followed by the
    # shorter trailing chunk (if any) in a single pass.
    return [
        words[start:start + word_limit]
        for start in range(0, len(words), word_limit)
    ]
|
|
|
|
|
|
def split_words(text: str):
    """Tokenize ``text`` into ASCII runs and individual non-ASCII characters.

    The text is first split on whitespace with ``str.split()``. Within each
    resulting segment, consecutive ASCII characters are grouped into a single
    word, while every non-ASCII character (e.g. a Chinese character) becomes
    a word of its own.

    Args:
        text: Input string to tokenize.

    Returns:
        The list of words in order of appearance. It is empty when ``text``
        is empty or contains only whitespace.
    """
    words = []
    for seg in text.split():
        # ``str.split()`` without arguments strips all whitespace, so ``seg``
        # contains none.
        ascii_run = []
        for ch in seg:
            # Compare the code point rather than encoding the character:
            # this avoids allocating a bytes object per character and does
            # not raise UnicodeEncodeError on lone surrogates.
            if ord(ch) < 128:
                ascii_run.append(ch)
                continue
            # Non-ASCII character: flush the pending ASCII run first, then
            # emit the character as its own word.
            if ascii_run:
                words.append("".join(ascii_run))
                ascii_run = []
            words.append(ch)
        # Flush the ASCII run left over at the end of the segment.
        if ascii_run:
            words.append("".join(ascii_run))
    return words
|