From 8ce7dad0578704fab2824edea997b7fef5674707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=A8=E5=AE=88?= <yangyexin.yyx@alibaba-inc.com>
Date: Wed, 11 Sep 2024 17:34:33 +0800
Subject: [PATCH 1/2] Add --no_vad option and send final result when client stops speaking

---
 .../funasr_wss_server_streaming_llm.py        | 45 +++++++++++++------
 1 file changed, 32 insertions(+), 13 deletions(-)
diff --git a/runtime/python/websocket/funasr_wss_server_streaming_llm.py b/runtime/python/websocket/funasr_wss_server_streaming_llm.py
index fb1bada7a..f7a2e7c86 100644
--- a/runtime/python/websocket/funasr_wss_server_streaming_llm.py
+++ b/runtime/python/websocket/funasr_wss_server_streaming_llm.py
@@ -29,6 +29,7 @@ parser.add_argument("--ngpu", type=int, default=1, help="0 for cpu, 1 for gpu")
 parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu")
 parser.add_argument("--ncpu", type=int, default=4, help="cpu cores")
 parser.add_argument("--return_sentence", action="store_true", help="return sentence or all_res")
+parser.add_argument("--no_vad", action="store_true", help="infer without vad")
 parser.add_argument(
     "--certfile",
     type=str,
@@ -483,12 +484,18 @@ async def ws_serve(websocket, path):
                         frames_asr.append(message)
 
                     # vad online
-                    try:
-                        speech_start_i, speech_end_i = await async_vad(websocket, message)
-                    except:
-                        print("error in vad")
-                    if speech_start_i != -1:
+                    if not args.no_vad:
+                        try:
+                            speech_start_i, speech_end_i = await async_vad(websocket, message)
+                        except:
+                            print("error in vad")
+                        if speech_start_i != -1:
+                            speech_start = True
+                            frames_asr = []
+                            frames_asr.extend(frames)
+                    else:
                         speech_start = True
+                        speech_end_i = -1
                         beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
                         frames_pre = frames[-beg_bias:]
                         frames_asr = []
@@ -496,19 +503,31 @@ async def ws_serve(websocket, path):
 
                 # vad end
                 if speech_end_i != -1 or not websocket.is_speaking:
-                    audio_in = b"".join(frames_asr)
-                    try:
-                        await streaming_transcribe(
-                            websocket, audio_in, is_vad_end=True, asr_prompt=asr_prompt, s2tt_prompt=s2tt_prompt
-                        )
-                    except Exception as e:
-                        print(f"error in streaming, {e}")
-                        print(f"error in streaming, {websocket.streaming_state}")
+                    if speech_end_i != -1:
+                        audio_in = b"".join(frames_asr)
+                        try:
+                            await streaming_transcribe(
+                                websocket, audio_in, is_vad_end=True, asr_prompt=asr_prompt, s2tt_prompt=s2tt_prompt
+                            )
+                        except Exception as e:
+                            print(f"error in streaming, {e}")
+                            print(f"error in streaming, {websocket.streaming_state}")
                     frames_asr = []
                     speech_start = False
                     websocket.streaming_state["previous_asr_text"] = ""
                     websocket.streaming_state["previous_s2tt_text"] = ""
                     if not websocket.is_speaking:
+                        message = json.dumps(
+                            {
+                                "mode": "online",
+                                "asr_text": websocket.streaming_state["onscreen_asr_res"] + "<em></em>",
+                                "s2tt_text": websocket.streaming_state["onscreen_s2tt_res"] + "<em></em>",
+                                "wav_name": websocket.wav_name,
+                                "is_final": websocket.is_speaking,
+                                "is_sentence_end": True,
+                            }
+                        )
+                        await websocket.send(message)
                         await clear_websocket()
                     if args.return_sentence:
                         websocket.streaming_state["previous_vad_onscreen_asr_text"] = ""

From 68c770f67cce8e11ae8ee6ab929068e365410b31 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=9C=A8=E5=AE=88?= <yangyexin.yyx@alibaba-inc.com>
Date: Wed, 11 Sep 2024 17:57:48 +0800
Subject: [PATCH 2/2] Fix swapped VAD/no_vad frame handling, add local checkpoint path, drop MAX_ITER_PER_CHUNK

---
 .../websocket/funasr_wss_server_streaming_llm.py      | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/runtime/python/websocket/funasr_wss_server_streaming_llm.py b/runtime/python/websocket/funasr_wss_server_streaming_llm.py
index f7a2e7c86..42c0c3d0b 100644
--- a/runtime/python/websocket/funasr_wss_server_streaming_llm.py
+++ b/runtime/python/websocket/funasr_wss_server_streaming_llm.py
@@ -79,6 +79,7 @@ audio_encoder_dir = snapshot_download("iic/SenseVoice", cache_dir=None, revision
 # audio_encoder_dir = "/nfs/yangyexin.yyx/init_model/iic/SenseVoiceModelscope_0712"
 device = "cuda:0"
 all_file_paths = [
+    "/nfs/yangyexin.yyx/init_model/s2tt/qwen2_7b_mmt_v15_20240910_streaming",
     "FunAudioLLM/qwen2_7b_mmt_v15_20240910_streaming",
     "FunAudioLLM/qwen2_7b_mmt_v15_20240902",
     "FunAudioLLM/qwen2_7b_mmt_v14_20240830",
@@ -92,7 +93,6 @@ llm_kwargs = {"num_beams": 1, "do_sample": False, "repetition_penalty": 1.3}
 UNFIX_LEN = 5
 MIN_LEN_PER_PARAGRAPH = 25
 MIN_LEN_SEC_AUDIO_FIX = 1.1
-MAX_ITER_PER_CHUNK = 20
 
 ckpt_dir = all_file_paths[0]
 
@@ -491,15 +491,16 @@ async def ws_serve(websocket, path):
                             print("error in vad")
                         if speech_start_i != -1:
                             speech_start = True
+                            speech_end_i = -1
+                            beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
+                            frames_pre = frames[-beg_bias:]
                             frames_asr = []
-                            frames_asr.extend(frames)
+                            frames_asr.extend(frames_pre)
                     else:
                         speech_start = True
                         speech_end_i = -1
-                        beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
-                        frames_pre = frames[-beg_bias:]
                         frames_asr = []
-                        frames_asr.extend(frames_pre)
+                        frames_asr.extend(frames)
 
                 # vad end
                 if speech_end_i != -1 or not websocket.is_speaking: