mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
sensevoice
This commit is contained in:
parent
f097706c40
commit
8b74979bd3
@ -3,26 +3,82 @@
|
||||
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
"""Demo: transcribe the bundled SenseVoiceSmall example clips.

Loads the SenseVoiceSmall model with FSMN-VAD segmentation, then runs
ASR over one packaged example clip per language and prints the
rich-transcription post-processed text for each.
"""

import sys

from funasr import AutoModel
from funasr.utils.postprocess_utils import rich_transcription_postprocess

model_dir = "iic/SenseVoiceSmall"

# Remote sample kept for reference; the loop below uses the clips
# bundled with the downloaded model instead.
input_file = (
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav"
)

model = AutoModel(
    model=model_dir,
    vad_model="fsmn-vad",  # split long audio into speech segments before ASR
    vad_kwargs={"max_single_segment_time": 30000},  # cap a VAD segment at 30 s
    device="cpu",
)

# One bundled example clip per language code shipped with the model.
for lang in ("en", "zh", "yue", "ja", "ko"):
    res = model.generate(
        input=f"{model.model_path}/example/{lang}.mp3",
        cache={},
        language="auto",  # or one of "zh", "en", "yue", "ja", "ko", "nospeech"
        use_itn=True,  # inverse text normalization (digits, punctuation)
        batch_size_s=60,
        merge_vad=True,  # merge short VAD clips for better downstream accuracy
        merge_length_s=15,  # merged-clip target length in seconds
    )
    text = rich_transcription_postprocess(res[0]["text"])
    print(text)
|
||||
|
||||
@ -339,7 +339,9 @@ class AutoModel:
|
||||
# FIX(gcf): concat the VAD clips for the SenseVoice model for better AED
|
||||
if kwargs.get("merge_vad", False):
|
||||
for i in range(len(res)):
|
||||
res[i]["value"] = merge_vad(res[i]["value"], kwargs.get("merge_length", 15000))
|
||||
res[i]["value"] = merge_vad(
|
||||
res[i]["value"], kwargs.get("merge_length_s", 15) * 1000
|
||||
)
|
||||
|
||||
# step.2 compute asr model
|
||||
model = self.model
|
||||
@ -380,6 +382,9 @@ class AutoModel:
|
||||
if len(sorted_data) > 0 and len(sorted_data[0]) > 0:
|
||||
batch_size = max(batch_size, sorted_data[0][0][1] - sorted_data[0][0][0])
|
||||
|
||||
if kwargs["device"] == "cpu":
|
||||
batch_size = 0
|
||||
|
||||
beg_idx = 0
|
||||
beg_asr_total = time.time()
|
||||
time_speech_total_per_sample = speech_lengths / 16000
|
||||
|
||||
Loading…
Reference in New Issue
Block a user