From 8ce7dad0578704fab2824edea997b7fef5674707 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=A8=E5=AE=88?= Date: Wed, 11 Sep 2024 17:34:33 +0800 Subject: [PATCH 1/2] update --- .../funasr_wss_server_streaming_llm.py | 45 +++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/runtime/python/websocket/funasr_wss_server_streaming_llm.py b/runtime/python/websocket/funasr_wss_server_streaming_llm.py index fb1bada7a..f7a2e7c86 100644 --- a/runtime/python/websocket/funasr_wss_server_streaming_llm.py +++ b/runtime/python/websocket/funasr_wss_server_streaming_llm.py @@ -29,6 +29,7 @@ parser.add_argument("--ngpu", type=int, default=1, help="0 for cpu, 1 for gpu") parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu") parser.add_argument("--ncpu", type=int, default=4, help="cpu cores") parser.add_argument("--return_sentence", action="store_true", help="return sentence or all_res") +parser.add_argument("--no_vad", action="store_true", help="infer without vad") parser.add_argument( "--certfile", type=str, @@ -483,12 +484,18 @@ async def ws_serve(websocket, path): frames_asr.append(message) # vad online - try: - speech_start_i, speech_end_i = await async_vad(websocket, message) - except: - print("error in vad") - if speech_start_i != -1: + if not args.no_vad: + try: + speech_start_i, speech_end_i = await async_vad(websocket, message) + except: + print("error in vad") + if speech_start_i != -1: + speech_start = True + frames_asr = [] + frames_asr.extend(frames) + else: speech_start = True + speech_end_i = -1 beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms frames_pre = frames[-beg_bias:] frames_asr = [] @@ -496,19 +503,31 @@ async def ws_serve(websocket, path): # vad end if speech_end_i != -1 or not websocket.is_speaking: - audio_in = b"".join(frames_asr) - try: - await streaming_transcribe( - websocket, audio_in, is_vad_end=True, asr_prompt=asr_prompt, s2tt_prompt=s2tt_prompt - ) - except Exception as e: - print(f"error in streaming, {e}") - print(f"error in streaming, {websocket.streaming_state}") + if speech_end_i != -1: + audio_in = b"".join(frames_asr) + try: + await streaming_transcribe( + websocket, audio_in, is_vad_end=True, asr_prompt=asr_prompt, s2tt_prompt=s2tt_prompt + ) + except Exception as e: + print(f"error in streaming, {e}") + print(f"error in streaming, {websocket.streaming_state}") frames_asr = [] speech_start = False websocket.streaming_state["previous_asr_text"] = "" websocket.streaming_state["previous_s2tt_text"] = "" if not websocket.is_speaking: + message = json.dumps( + { + "mode": "online", + "asr_text": websocket.streaming_state["onscreen_asr_res"] + "", + "s2tt_text": websocket.streaming_state["onscreen_s2tt_res"] + "", + "wav_name": websocket.wav_name, + "is_final": websocket.is_speaking, + "is_sentence_end": True, + } + ) + await websocket.send(message) await clear_websocket() if args.return_sentence: websocket.streaming_state["previous_vad_onscreen_asr_text"] = "" From 68c770f67cce8e11ae8ee6ab929068e365410b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9C=A8=E5=AE=88?= Date: Wed, 11 Sep 2024 17:57:48 +0800 Subject: [PATCH 2/2] update --- .../websocket/funasr_wss_server_streaming_llm.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/runtime/python/websocket/funasr_wss_server_streaming_llm.py b/runtime/python/websocket/funasr_wss_server_streaming_llm.py index f7a2e7c86..42c0c3d0b 100644 --- a/runtime/python/websocket/funasr_wss_server_streaming_llm.py +++ b/runtime/python/websocket/funasr_wss_server_streaming_llm.py @@ -79,6 +79,7 @@ audio_encoder_dir = snapshot_download("iic/SenseVoice", cache_dir=None, revision # audio_encoder_dir = "/nfs/yangyexin.yyx/init_model/iic/SenseVoiceModelscope_0712" device = "cuda:0" all_file_paths = [ + "/nfs/yangyexin.yyx/init_model/s2tt/qwen2_7b_mmt_v15_20240910_streaming", "FunAudioLLM/qwen2_7b_mmt_v15_20240910_streaming", "FunAudioLLM/qwen2_7b_mmt_v15_20240902", "FunAudioLLM/qwen2_7b_mmt_v14_20240830", @@ -92,7 +93,6 @@ llm_kwargs = {"num_beams": 1, "do_sample": False, "repetition_penalty": 1.3} UNFIX_LEN = 5 MIN_LEN_PER_PARAGRAPH = 25 MIN_LEN_SEC_AUDIO_FIX = 1.1 -MAX_ITER_PER_CHUNK = 20 ckpt_dir = all_file_paths[0] @@ -491,15 +491,16 @@ async def ws_serve(websocket, path): print("error in vad") if speech_start_i != -1: speech_start = True + speech_end_i = -1 + beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms + frames_pre = frames[-beg_bias:] frames_asr = [] - frames_asr.extend(frames) + frames_asr.extend(frames_pre) else: speech_start = True speech_end_i = -1 - beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms - frames_pre = frames[-beg_bias:] frames_asr = [] - frames_asr.extend(frames_pre) + frames_asr.extend(frames) # vad end if speech_end_i != -1 or not websocket.is_speaking: