websocket new version for offline 2pass send bytes

2025-09-15 14:48:36 +08:00 · 2023-05-13 00:20:19 +08:00 · 2023-05-13 00:20:19 +08:00 · 9dad49c3a1
commit 9dad49c3a1
parent f03a604204
10 changed files with 395 additions and 141 deletions
--- a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO
@ -0,0 +1,190 @@
+Metadata-Version: 2.1
+Name: funasr-onnx
+Version: 0.1.0
+Summary: FunASR: A Fundamental End-to-End Speech Recognition Toolkit
+Home-page: https://github.com/alibaba-damo-academy/FunASR.git
+Author: Speech Lab of DAMO Academy, Alibaba Group
+Author-email: funasr@list.alibaba-inc.com
+License: MIT
+Keywords: funasr,asr
+Platform: Any
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Description-Content-Type: text/markdown
+
+# ONNXRuntime-python
+
+
+## Install `funasr_onnx`
+
+install from pip
+```shell
+pip install -U funasr_onnx
+# For the users in China, you could install with the command:
+# pip install -U funasr_onnx -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+or install from source code
+
+```shell
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+cd funasr/runtime/python/onnxruntime
+pip install -e ./
+# For the users in China, you could install with the command:
+# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+## Inference with runtime
+
+### Speech Recognition
+#### Paraformer
+ ```python
+from funasr_onnx import Paraformer
+from pathlib import Path
+
+model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1, quantize=True)
+
+wav_path = ['{}/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'.format(Path.home())]
+
+result = model(wav_path)
+print(result)
+ ```
+- `model_dir`: model_name in modelscope or local path downloaded from modelscope. If the local path is set, it should contain `model.onnx`, `config.yaml`, `am.mvn`
+- `batch_size`: `1` (Default), the batch size during inference
+- `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to gpu_id (Please make sure that you have installed onnxruntime-gpu)
+- `quantize`: `False` (Default), load the model of `model.onnx` in `model_dir`. If set `True`, load the model of `model_quant.onnx` in `model_dir`
+- `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU
+
+Input: wav format file; supported input types: `str, np.ndarray, List[str]`
+
+Output: `List[str]`: recognition result
+
+#### Paraformer-online
+
+### Voice Activity Detection
+#### FSMN-VAD
+```python
+from funasr_onnx import Fsmn_vad
+from pathlib import Path
+
+model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home())
+
+model = Fsmn_vad(model_dir)
+
+result = model(wav_path)
+print(result)
+```
+- `model_dir`: model_name in modelscope or local path downloaded from modelscope. If the local path is set, it should contain `model.onnx`, `config.yaml`, `am.mvn`
+- `batch_size`: `1` (Default), the batch size during inference
+- `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to gpu_id (Please make sure that you have installed onnxruntime-gpu)
+- `quantize`: `False` (Default), load the model of `model.onnx` in `model_dir`. If set `True`, load the model of `model_quant.onnx` in `model_dir`
+- `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU
+
+Input: wav format file; supported input types: `str, np.ndarray, List[str]`
+
+Output: `List[str]`: recognition result
+
+
+#### FSMN-VAD-online
+```python
+from funasr_onnx import Fsmn_vad_online
+import soundfile
+from pathlib import Path
+
+model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home())
+
+model = Fsmn_vad_online(model_dir)
+
+
+##online vad
+speech, sample_rate = soundfile.read(wav_path)
+speech_length = speech.shape[0]
+#
+sample_offset = 0
+step = 1600
+param_dict = {'in_cache': []}
+for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+    if sample_offset + step >= speech_length - 1:
+        step = speech_length - sample_offset
+        is_final = True
+    else:
+        is_final = False
+    param_dict['is_final'] = is_final
+    segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
+                            param_dict=param_dict)
+    if segments_result:
+        print(segments_result)
+```
+- `model_dir`: model_name in modelscope or local path downloaded from modelscope. If the local path is set, it should contain `model.onnx`, `config.yaml`, `am.mvn`
+- `batch_size`: `1` (Default), the batch size during inference
+- `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to gpu_id (Please make sure that you have installed onnxruntime-gpu)
+- `quantize`: `False` (Default), load the model of `model.onnx` in `model_dir`. If set `True`, load the model of `model_quant.onnx` in `model_dir`
+- `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU
+
+Input: wav format file; supported input types: `str, np.ndarray, List[str]`
+
+Output: `List[str]`: recognition result
+
+
+### Punctuation Restoration
+#### CT-Transformer
+```python
+from funasr_onnx import CT_Transformer
+
+model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model = CT_Transformer(model_dir)
+
+text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益"
+result = model(text_in)
+print(result[0])
+```
+- `model_dir`: model_name in modelscope or local path downloaded from modelscope. If the local path is set, it should contain `model.onnx`, `config.yaml`, `am.mvn`
+- `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to gpu_id (Please make sure that you have installed onnxruntime-gpu)
+- `quantize`: `False` (Default), load the model of `model.onnx` in `model_dir`. If set `True`, load the model of `model_quant.onnx` in `model_dir`
+- `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU
+
+Input: `str`, raw text of asr result
+
+Output: `List[str]`: recognition result
+
+
+#### CT-Transformer-online
+```python
+from funasr_onnx import CT_Transformer_VadRealtime
+
+model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
+model = CT_Transformer_VadRealtime(model_dir)
+
+text_in  = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流>问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+
+vads = text_in.split("|")
+rec_result_all=""
+param_dict = {"cache": []}
+for vad in vads:
+    result = model(vad, param_dict=param_dict)
+    rec_result_all += result[0]
+
+print(rec_result_all)
+```
+- `model_dir`: model_name in modelscope or local path downloaded from modelscope. If the local path is set, it should contain `model.onnx`, `config.yaml`, `am.mvn`
+- `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to gpu_id (Please make sure that you have installed onnxruntime-gpu)
+- `quantize`: `False` (Default), load the model of `model.onnx` in `model_dir`. If set `True`, load the model of `model_quant.onnx` in `model_dir`
+- `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU
+
+Input: `str`, raw text of asr result
+
+Output: `List[str]`: recognition result
+
+## Performance benchmark
+
+Please refer to the [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_onnx.md)
+
+## Acknowledge
+1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We partially referred to [SWHL](https://github.com/RapidAI/RapidASR) for onnxruntime (only for paraformer model).
--- a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt
@ -0,0 +1,17 @@
+README.md
+setup.py
+funasr_onnx/__init__.py
+funasr_onnx/paraformer_bin.py
+funasr_onnx/punc_bin.py
+funasr_onnx/vad_bin.py
+funasr_onnx.egg-info/PKG-INFO
+funasr_onnx.egg-info/SOURCES.txt
+funasr_onnx.egg-info/dependency_links.txt
+funasr_onnx.egg-info/requires.txt
+funasr_onnx.egg-info/top_level.txt
+funasr_onnx/utils/__init__.py
+funasr_onnx/utils/e2e_vad.py
+funasr_onnx/utils/frontend.py
+funasr_onnx/utils/postprocess_utils.py
+funasr_onnx/utils/timestamp_utils.py
+funasr_onnx/utils/utils.py
--- a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt
@ -0,0 +1 @@
+
--- a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt
@ -0,0 +1,10 @@
+librosa
+onnxruntime>=1.7.0
+scipy
+numpy>=1.19.3
+typeguard
+kaldi-native-fbank
+PyYAML>=5.1.2
+funasr
+modelscope
+onnx
--- a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt
@ -0,0 +1 @@
+funasr_onnx
--- a/funasr/runtime/python/websocket/ws_client.py
+++ b/funasr/runtime/python/websocket/ws_client.py
@ -85,9 +85,8 @@ async def record_microphone():
                    input=True,
                    frames_per_buffer=CHUNK)

-    message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": wav_name,"is_speaking": True})
+    message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": "microphone", "is_speaking": True})
    voices.put(message)
-    is_speaking = True
    while True:

        data = stream.read(CHUNK)
@ -146,9 +145,6 @@ async def record_from_scp(chunk_begin,chunk_size):
            sleep_duration = 0.001 if args.send_without_sleep else 60*args.chunk_size[1]/args.chunk_interval/1000
            await asyncio.sleep(sleep_duration)

-    is_finished = True
-    message = json.dumps({"is_finished": is_finished})
-    voices.put(message)

 async def ws_send():
    global voices
@ -241,29 +237,9 @@ def one_thread(id,chunk_begin,chunk_size):


 if __name__ == '__main__':
-    # calculate the number of wavs for each preocess
-    if args.audio_in.endswith(".scp"):
-        f_scp = open(args.audio_in)
-        wavs = f_scp.readlines()
-    else:
-        wavs = [args.audio_in]
-    total_len=len(wavs)
-    if total_len>=args.test_thread_num:
-         chunk_size=int((total_len)/args.test_thread_num)
-         remain_wavs=total_len-chunk_size*args.test_thread_num
-    else:
-         chunk_size=0
-    
    process_list = []
-    chunk_begin=0
    for i in range(args.test_thread_num):
-        now_chunk_size= chunk_size
-        if remain_wavs>0:
-            now_chunk_size=chunk_size+1
-            remain_wavs=remain_wavs-1
-        # process i handle wavs at chunk_begin and size of now_chunk_size
-        p = Process(target=one_thread,args=(i,chunk_begin,now_chunk_size))
-        chunk_begin=chunk_begin+now_chunk_size
+        p = Process(target=one_thread,args=(i, 0, 0))
        p.start()
        process_list.append(p)

@ -271,5 +247,38 @@ if __name__ == '__main__':
        p.join()

    print('end')
- 
+
+#
+# if __name__ == '__main__':
+#     # calculate the number of wavs for each preocess
+#     if args.audio_in.endswith(".scp"):
+#         f_scp = open(args.audio_in)
+#         wavs = f_scp.readlines()
+#     else:
+#         wavs = [args.audio_in]
+#     total_len=len(wavs)
+#     if total_len>=args.test_thread_num:
+#          chunk_size=int((total_len)/args.test_thread_num)
+#          remain_wavs=total_len-chunk_size*args.test_thread_num
+#     else:
+#          chunk_size=0
+#
+#     process_list = []
+#     chunk_begin=0
+#     for i in range(args.test_thread_num):
+#         now_chunk_size= chunk_size
+#         if remain_wavs>0:
+#             now_chunk_size=chunk_size+1
+#             remain_wavs=remain_wavs-1
+#         # process i handle wavs at chunk_begin and size of now_chunk_size
+#         p = Process(target=one_thread,args=(i,chunk_begin,now_chunk_size))
+#         chunk_begin=chunk_begin+now_chunk_size
+#         p.start()
+#         process_list.append(p)
+#
+#     for i in process_list:
+#         p.join()
+#
+#     print('end')
+#

--- a/funasr/runtime/python/websocket/ws_server_2pass.py
+++ b/funasr/runtime/python/websocket/ws_server_2pass.py
@ -74,47 +74,54 @@ async def ws_serve(websocket, path):
    websocket.param_dict_punc = {'cache': list()}
    websocket.vad_pre_idx = 0
    speech_start = False
+    websocket.wav_name = "microphone"
+    print("new user connected", flush=True)

    try:
        async for message in websocket:
-            message = json.loads(message)
-            is_finished = message["is_finished"]
-            if not is_finished:
-                audio = bytes(message['audio'], 'ISO-8859-1')
-                frames.append(audio)
-                duration_ms = len(audio)//32
-                websocket.vad_pre_idx += duration_ms
-
-                is_speaking = message["is_speaking"]
-                websocket.param_dict_vad["is_final"] = not is_speaking
-                websocket.param_dict_asr_online["is_final"] = not is_speaking
-                websocket.param_dict_asr_online["chunk_size"] = message["chunk_size"]
-                websocket.wav_name = message.get("wav_name", "demo")
-                # asr online
-                frames_asr_online.append(audio)
-                if len(frames_asr_online) % message["chunk_interval"] == 0:
-                    audio_in = b"".join(frames_asr_online)
-                    await async_asr_online(websocket, audio_in)
-                    frames_asr_online = []
-                if speech_start:
-                    frames_asr.append(audio)
-                # vad online
-                speech_start_i, speech_end_i = await async_vad(websocket, audio)
-                if speech_start_i:
-                    speech_start = True
-                    beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
-                    frames_pre = frames[-beg_bias:]
-                    frames_asr = []
-                    frames_asr.extend(frames_pre)
+            if isinstance(message, str):
+                messagejson = json.loads(message)
+        
+                if "is_speaking" in messagejson:
+                    websocket.is_speaking = messagejson["is_speaking"]
+                    websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
+                if "chunk_interval" in messagejson:
+                    websocket.chunk_interval = messagejson["chunk_interval"]
+                if "wav_name" in messagejson:
+                    websocket.wav_name = messagejson.get("wav_name")
+                if "chunk_size" in messagejson:
+                    websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
+            if len(frames_asr_online) > 0 or len(frames_asr) > 0 or not isinstance(message, str):
+                if not isinstance(message, str):
+                    frames.append(message)
+                    duration_ms = len(message)//32
+                    websocket.vad_pre_idx += duration_ms
+        
+                    # asr online
+                    frames_asr_online.append(message)
+                    if len(frames_asr_online) % websocket.chunk_interval == 0:
+                        audio_in = b"".join(frames_asr_online)
+                        await async_asr_online(websocket, audio_in)
+                        frames_asr_online = []
+                    if speech_start:
+                        frames_asr.append(message)
+                    # vad online
+                    speech_start_i, speech_end_i = await async_vad(websocket, message)
+                    if speech_start_i:
+                        speech_start = True
+                        beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
+                        frames_pre = frames[-beg_bias:]
+                        frames_asr = []
+                        frames_asr.extend(frames_pre)
                # asr punc offline
-                if speech_end_i or not is_speaking:
+                if speech_end_i or not websocket.is_speaking:
                    audio_in = b"".join(frames_asr)
                    await async_asr(websocket, audio_in)
                    frames_asr = []
                    speech_start = False
                    frames_asr_online = []
                    websocket.param_dict_asr_online = {"cache": dict()}
-                    if not is_speaking:
+                    if not websocket.is_speaking:
                        websocket.vad_pre_idx = 0
                        frames = []
                        websocket.param_dict_vad = {'in_cache': dict()}
@ -168,7 +175,7 @@ async def async_asr_online(websocket, audio_in):
        audio_in = load_bytes(audio_in)
        rec_result = inference_pipeline_asr_online(audio_in=audio_in,
                                                   param_dict=websocket.param_dict_asr_online)
-        if websocket.param_dict_asr_online["is_final"]:
+        if websocket.param_dict_asr_online.get("is_final", False):
            websocket.param_dict_asr_online["cache"] = dict()
        if "text" in rec_result:
            if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
--- a/funasr/runtime/python/websocket/ws_server_offline.py
+++ b/funasr/runtime/python/websocket/ws_server_offline.py
@ -65,35 +65,40 @@ async def ws_serve(websocket, path):
    websocket.param_dict_punc = {'cache': list()}
    websocket.vad_pre_idx = 0
    speech_start = False
+    websocket.wav_name = "microphone"
+    print("new user connected", flush=True)

    try:
        async for message in websocket:
-            message = json.loads(message)
-            is_finished = message["is_finished"]
-            if not is_finished:
-                audio = bytes(message['audio'], 'ISO-8859-1')
-                frames.append(audio)
-                duration_ms = len(audio)//32
-                websocket.vad_pre_idx += duration_ms
-
-                is_speaking = message["is_speaking"]
-                websocket.param_dict_vad["is_final"] = not is_speaking
-                websocket.wav_name = message.get("wav_name", "demo")
-                if speech_start:
-                    frames_asr.append(audio)
-                speech_start_i, speech_end_i = await async_vad(websocket, audio)
-                if speech_start_i:
-                    speech_start = True
-                    beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
-                    frames_pre = frames[-beg_bias:]
-                    frames_asr = []
-                    frames_asr.extend(frames_pre)
-                if speech_end_i or not is_speaking:
+            if isinstance(message, str):
+                messagejson = json.loads(message)
+                if "is_speaking" in messagejson:
+                    websocket.is_speaking = messagejson["is_speaking"]
+                    websocket.param_dict_vad["is_final"] = not websocket.is_speaking
+                if "wav_name" in messagejson:
+                    websocket.wav_name = messagejson.get("wav_name")
+            
+            if len(frames_asr) > 0 or not isinstance(message, str):
+                if not isinstance(message, str):
+                    frames.append(message)
+                    duration_ms = len(message)//32
+                    websocket.vad_pre_idx += duration_ms
+    
+                    if speech_start:
+                        frames_asr.append(message)
+                    speech_start_i, speech_end_i = await async_vad(websocket, message)
+                    if speech_start_i:
+                        speech_start = True
+                        beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
+                        frames_pre = frames[-beg_bias:]
+                        frames_asr = []
+                        frames_asr.extend(frames_pre)
+                if speech_end_i or not websocket.is_speaking:
                    audio_in = b"".join(frames_asr)
                    await async_asr(websocket, audio_in)
                    frames_asr = []
                    speech_start = False
-                    if not is_speaking:
+                    if not websocket.is_speaking:
                        websocket.vad_pre_idx = 0
                        frames = []
                        websocket.param_dict_vad = {'in_cache': dict()}
@ -133,7 +138,7 @@ async def async_asr(websocket, audio_in):
                
                rec_result = inference_pipeline_asr(audio_in=audio_in,
                                                    param_dict=websocket.param_dict_asr)
-                # print(rec_result)
+                print(rec_result)
                if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"])>0:
                    rec_result = inference_pipeline_punc(text_in=rec_result['text'],
                                                         param_dict=websocket.param_dict_punc)
--- a/funasr/runtime/python/websocket/ws_server_online.py
+++ b/funasr/runtime/python/websocket/ws_server_online.py
@ -26,74 +26,72 @@ websocket_users = set()
 print("model loading")

 inference_pipeline_asr_online = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model=args.asr_model_online,
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-    model_revision='v1.0.4')
+	task=Tasks.auto_speech_recognition,
+	model=args.asr_model_online,
+	ngpu=args.ngpu,
+	ncpu=args.ncpu,
+	model_revision='v1.0.4')

 print("model loaded")



 async def ws_serve(websocket, path):
-    frames_asr_online = []
-    global websocket_users
-    websocket_users.add(websocket)
-    websocket.param_dict_asr_online = {"cache": dict()}
-    print("new user connected",flush=True)
-    try:
-        async for message in websocket:
-            
- 
-            if isinstance(message,str):
-              messagejson = json.loads(message)
-               
-              if "is_speaking" in messagejson:
-                  websocket.is_speaking = messagejson["is_speaking"]  
-                  websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
-              if "is_finished" in messagejson:
-                  websocket.is_speaking = False
-                  websocket.param_dict_asr_online["is_final"] = True
-              if "chunk_interval" in messagejson:
-                  websocket.chunk_interval=messagejson["chunk_interval"]
-              if "wav_name" in messagejson:
-                  websocket.wav_name = messagejson.get("wav_name", "demo")
-              if "chunk_size" in messagejson:
-                  websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
-            # if has bytes in buffer or message is bytes
-            if len(frames_asr_online)>0 or not isinstance(message,str):
-               if not isinstance(message,str):
-                 frames_asr_online.append(message)
-               if len(frames_asr_online) % websocket.chunk_interval == 0 or not websocket.is_speaking:
-                    audio_in = b"".join(frames_asr_online)
-                    if not websocket.is_speaking:
-                       #padding 0.5s at end gurantee that asr engine can fire out last word
-                       audio_in=audio_in+b''.join(np.zeros(int(16000*0.5),dtype=np.int16))
-                    await async_asr_online(websocket,audio_in)
-                    frames_asr_online = []
+	frames_asr_online = []
+	global websocket_users
+	websocket_users.add(websocket)
+	websocket.param_dict_asr_online = {"cache": dict()}
+	websocket.wav_name = "microphone"
+	print("new user connected",flush=True)
+	try:
+		async for message in websocket:
+			
+			
+			if isinstance(message, str):
+				messagejson = json.loads(message)
+				
+				if "is_speaking" in messagejson:
+					websocket.is_speaking = messagejson["is_speaking"]
+					websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
+				if "chunk_interval" in messagejson:
+					websocket.chunk_interval=messagejson["chunk_interval"]
+				if "wav_name" in messagejson:
+					websocket.wav_name = messagejson.get("wav_name")
+				if "chunk_size" in messagejson:
+					websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
+			# if has bytes in buffer or message is bytes
+			if len(frames_asr_online) > 0 or not isinstance(message, str):
+				if not isinstance(message,str):
+					frames_asr_online.append(message)
+				if len(frames_asr_online) % websocket.chunk_interval == 0 or not websocket.is_speaking:
+					audio_in = b"".join(frames_asr_online)
+					# if not websocket.is_speaking:
+						#padding 0.5s at end gurantee that asr engine can fire out last word
+						# audio_in=audio_in+b''.join(np.zeros(int(16000*0.5),dtype=np.int16))
+					await async_asr_online(websocket,audio_in)
+					frames_asr_online = []
+	
+	
+	except websockets.ConnectionClosed:
+		print("ConnectionClosed...", websocket_users)
+		websocket_users.remove(websocket)
+	except websockets.InvalidState:
+		print("InvalidState...")
+	except Exception as e:
+		print("Exception:", e)

-     
-    except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)
-        websocket_users.remove(websocket)
-    except websockets.InvalidState:
-        print("InvalidState...")
-    except Exception as e:
-        print("Exception:", e)

- 
 async def async_asr_online(websocket,audio_in):
-            if len(audio_in) > 0:
-                audio_in = load_bytes(audio_in)
-                rec_result = inference_pipeline_asr_online(audio_in=audio_in,
-                                                           param_dict=websocket.param_dict_asr_online)
-                if websocket.param_dict_asr_online["is_final"]:
-                    websocket.param_dict_asr_online["cache"] = dict()
-                if "text" in rec_result:
-                    if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
-                        message = json.dumps({"mode": "online", "text": rec_result["text"], "wav_name": websocket.wav_name})
-                        await websocket.send(message)
+	if len(audio_in) > 0:
+		audio_in = load_bytes(audio_in)
+		rec_result = inference_pipeline_asr_online(audio_in=audio_in,
+		                                           param_dict=websocket.param_dict_asr_online)
+		if websocket.param_dict_asr_online.get("is_final", False):
+			websocket.param_dict_asr_online["cache"] = dict()
+		if "text" in rec_result:
+			if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
+				message = json.dumps({"mode": "online", "text": rec_result["text"], "wav_name": websocket.wav_name})
+				await websocket.send(message)



--- a/funasr/utils/modelscope_utils.py
+++ b/funasr/utils/modelscope_utils.py
@ -0,0 +1,16 @@
+import os
+from modelscope.hub.snapshot_download import snapshot_download
+
+
+def check_model_dir(model_dir, model_name: str = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"):
+	"""Symlink model_dir into a sibling ".cache" tree under model_name, then return snapshot_download's result for that cache root."""
+	# Use the caller's model_dir as given; a developer-machine path was previously hard-coded here.
+	cache_root = os.path.dirname(model_dir)
+	dst_dir_root = os.path.join(cache_root, ".cache")
+	dst = os.path.join(dst_dir_root, model_name)
+	dst_dir = os.path.dirname(dst)
+	os.makedirs(dst_dir, exist_ok=True)
+	if not os.path.exists(dst):
+		os.symlink(model_dir, dst)
+	# Return the resolved directory; the original assigned it to a local and discarded it.
+	return snapshot_download(model_name, cache_dir=dst_dir_root)