mirror of
https://github.com/modelscope/FunASR
synced 2025-09-15 14:48:36 +08:00
Merge branch 'dev_gzf_deepspeed' of gitlab.alibaba-inc.com:zhifu.gzf/FunASR into dev_gzf_deepspeed
This commit is contained in:
commit
397977f4ae
@ -29,6 +29,7 @@ parser.add_argument("--ngpu", type=int, default=1, help="0 for cpu, 1 for gpu")
|
|||||||
parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu")
|
parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu")
|
||||||
parser.add_argument("--ncpu", type=int, default=4, help="cpu cores")
|
parser.add_argument("--ncpu", type=int, default=4, help="cpu cores")
|
||||||
parser.add_argument("--return_sentence", action="store_true", help="return sentence or all_res")
|
parser.add_argument("--return_sentence", action="store_true", help="return sentence or all_res")
|
||||||
|
parser.add_argument("--no_vad", action="store_true", help="infer without vad")
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--certfile",
|
"--certfile",
|
||||||
type=str,
|
type=str,
|
||||||
@ -78,6 +79,7 @@ audio_encoder_dir = snapshot_download("iic/SenseVoice", cache_dir=None, revision
|
|||||||
# audio_encoder_dir = "/nfs/yangyexin.yyx/init_model/iic/SenseVoiceModelscope_0712"
|
# audio_encoder_dir = "/nfs/yangyexin.yyx/init_model/iic/SenseVoiceModelscope_0712"
|
||||||
device = "cuda:0"
|
device = "cuda:0"
|
||||||
all_file_paths = [
|
all_file_paths = [
|
||||||
|
"/nfs/yangyexin.yyx/init_model/s2tt/qwen2_7b_mmt_v15_20240910_streaming",
|
||||||
"FunAudioLLM/qwen2_7b_mmt_v15_20240910_streaming",
|
"FunAudioLLM/qwen2_7b_mmt_v15_20240910_streaming",
|
||||||
"FunAudioLLM/qwen2_7b_mmt_v15_20240902",
|
"FunAudioLLM/qwen2_7b_mmt_v15_20240902",
|
||||||
"FunAudioLLM/qwen2_7b_mmt_v14_20240830",
|
"FunAudioLLM/qwen2_7b_mmt_v14_20240830",
|
||||||
@ -91,7 +93,6 @@ llm_kwargs = {"num_beams": 1, "do_sample": False, "repetition_penalty": 1.3}
|
|||||||
UNFIX_LEN = 5
|
UNFIX_LEN = 5
|
||||||
MIN_LEN_PER_PARAGRAPH = 25
|
MIN_LEN_PER_PARAGRAPH = 25
|
||||||
MIN_LEN_SEC_AUDIO_FIX = 1.1
|
MIN_LEN_SEC_AUDIO_FIX = 1.1
|
||||||
MAX_ITER_PER_CHUNK = 20
|
|
||||||
|
|
||||||
ckpt_dir = all_file_paths[0]
|
ckpt_dir = all_file_paths[0]
|
||||||
|
|
||||||
@ -483,19 +484,27 @@ async def ws_serve(websocket, path):
|
|||||||
frames_asr.append(message)
|
frames_asr.append(message)
|
||||||
|
|
||||||
# vad online
|
# vad online
|
||||||
|
if not args.no_vad:
|
||||||
try:
|
try:
|
||||||
speech_start_i, speech_end_i = await async_vad(websocket, message)
|
speech_start_i, speech_end_i = await async_vad(websocket, message)
|
||||||
except:
|
except:
|
||||||
print("error in vad")
|
print("error in vad")
|
||||||
if speech_start_i != -1:
|
if speech_start_i != -1:
|
||||||
speech_start = True
|
speech_start = True
|
||||||
|
speech_end_i = -1
|
||||||
beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
|
beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
|
||||||
frames_pre = frames[-beg_bias:]
|
frames_pre = frames[-beg_bias:]
|
||||||
frames_asr = []
|
frames_asr = []
|
||||||
frames_asr.extend(frames_pre)
|
frames_asr.extend(frames_pre)
|
||||||
|
else:
|
||||||
|
speech_start = True
|
||||||
|
speech_end_i = -1
|
||||||
|
frames_asr = []
|
||||||
|
frames_asr.extend(frames)
|
||||||
|
|
||||||
# vad end
|
# vad end
|
||||||
if speech_end_i != -1 or not websocket.is_speaking:
|
if speech_end_i != -1 or not websocket.is_speaking:
|
||||||
|
if speech_end_i != -1:
|
||||||
audio_in = b"".join(frames_asr)
|
audio_in = b"".join(frames_asr)
|
||||||
try:
|
try:
|
||||||
await streaming_transcribe(
|
await streaming_transcribe(
|
||||||
@ -509,6 +518,17 @@ async def ws_serve(websocket, path):
|
|||||||
websocket.streaming_state["previous_asr_text"] = ""
|
websocket.streaming_state["previous_asr_text"] = ""
|
||||||
websocket.streaming_state["previous_s2tt_text"] = ""
|
websocket.streaming_state["previous_s2tt_text"] = ""
|
||||||
if not websocket.is_speaking:
|
if not websocket.is_speaking:
|
||||||
|
message = json.dumps(
|
||||||
|
{
|
||||||
|
"mode": "online",
|
||||||
|
"asr_text": websocket.streaming_state["onscreen_asr_res"] + "<em></em>",
|
||||||
|
"s2tt_text": websocket.streaming_state["onscreen_s2tt_res"] + "<em></em>",
|
||||||
|
"wav_name": websocket.wav_name,
|
||||||
|
"is_final": websocket.is_speaking,
|
||||||
|
"is_sentence_end": True,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
await websocket.send(message)
|
||||||
await clear_websocket()
|
await clear_websocket()
|
||||||
if args.return_sentence:
|
if args.return_sentence:
|
||||||
websocket.streaming_state["previous_vad_onscreen_asr_text"] = ""
|
websocket.streaming_state["previous_vad_onscreen_asr_text"] = ""
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user