Merge branch 'dev_gzf_deepspeed' of gitlab.alibaba-inc.com:zhifu.gzf/FunASR into dev_gzf_deepspeed

2025-09-15 14:48:36 +08:00 · 2024-09-11 20:30:23 +08:00 · 2024-09-11 20:30:23 +08:00 · 397977f4ae
commit 397977f4ae
parent 94a94c4247 68c770f67c
1 changed file with 37 additions and 17 deletions
--- a/runtime/python/websocket/funasr_wss_server_streaming_llm.py
+++ b/runtime/python/websocket/funasr_wss_server_streaming_llm.py
@@ -29,6 +29,7 @@ parser.add_argument("--ngpu", type=int, default=1, help="0 for cpu, 1 for gpu")
 parser.add_argument("--device", type=str, default="cuda", help="cuda, cpu")
 parser.add_argument("--ncpu", type=int, default=4, help="cpu cores")
 parser.add_argument("--return_sentence", action="store_true", help="return sentence or all_res")
 parser.add_argument("--no_vad", action="store_true", help="infer without vad")
 parser.add_argument(
    "--certfile",
    type=str,
@@ -78,6 +79,7 @@ audio_encoder_dir = snapshot_download("iic/SenseVoice", cache_dir=None, revision
 # audio_encoder_dir = "/nfs/yangyexin.yyx/init_model/iic/SenseVoiceModelscope_0712"
 device = "cuda:0"
 all_file_paths = [
    "/nfs/yangyexin.yyx/init_model/s2tt/qwen2_7b_mmt_v15_20240910_streaming",
    "FunAudioLLM/qwen2_7b_mmt_v15_20240910_streaming",
    "FunAudioLLM/qwen2_7b_mmt_v15_20240902",
    "FunAudioLLM/qwen2_7b_mmt_v14_20240830",
@@ -91,7 +93,6 @@ llm_kwargs = {"num_beams": 1, "do_sample": False, "repetition_penalty": 1.3}
 UNFIX_LEN = 5
 MIN_LEN_PER_PARAGRAPH = 25
 MIN_LEN_SEC_AUDIO_FIX = 1.1
 MAX_ITER_PER_CHUNK = 20
 ckpt_dir = all_file_paths[0]
@@ -483,19 +484,27 @@ async def ws_serve(websocket, path):
                        frames_asr.append(message)
                    # vad online
                    if not args.no_vad:
                        try:
                            speech_start_i, speech_end_i = await async_vad(websocket, message)
                        except:
                            print("error in vad")
                        if speech_start_i != -1:
                            speech_start = True
                            speech_end_i = -1
                            beg_bias = (websocket.vad_pre_idx - speech_start_i) // duration_ms
                            frames_pre = frames[-beg_bias:]
                            frames_asr = []
                            frames_asr.extend(frames_pre)
                    else:
                        speech_start = True
                        speech_end_i = -1
                        frames_asr = []
                        frames_asr.extend(frames)
                # vad end
                if speech_end_i != -1 or not websocket.is_speaking:
                    if speech_end_i != -1:
                        audio_in = b"".join(frames_asr)
                        try:
                            await streaming_transcribe(
@@ -509,6 +518,17 @@ async def ws_serve(websocket, path):
                    websocket.streaming_state["previous_asr_text"] = ""
                    websocket.streaming_state["previous_s2tt_text"] = ""
                    if not websocket.is_speaking:
                        message = json.dumps(
                            {
                                "mode": "online",
                                "asr_text": websocket.streaming_state["onscreen_asr_res"] + "<em></em>",
                                "s2tt_text": websocket.streaming_state["onscreen_s2tt_res"] + "<em></em>",
                                "wav_name": websocket.wav_name,
                                "is_final": websocket.is_speaking,
                                "is_sentence_end": True,
                            }
                        )
                        await websocket.send(message)
                        await clear_websocket()
                    if args.return_sentence:
                        websocket.streaming_state["previous_vad_onscreen_asr_text"] = ""