From 4798614e683e94c1b8ed7b514fbc47381f6465ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Thu, 23 Mar 2023 15:41:57 +0800 Subject: [PATCH 1/3] websocket --- funasr/runtime/python/websocket/ASR_server.py | 68 +++++++++++-------- 1 file changed, 38 insertions(+), 30 deletions(-) diff --git a/funasr/runtime/python/websocket/ASR_server.py b/funasr/runtime/python/websocket/ASR_server.py index 0af520880..0aed2b498 100644 --- a/funasr/runtime/python/websocket/ASR_server.py +++ b/funasr/runtime/python/websocket/ASR_server.py @@ -85,10 +85,15 @@ def vad(data): # 推理 #print(type(data)) segments_result = vad_pipline(audio_in=data) #print(segments_result) - if len(segments_result) == 0: + speech_start = False + speech_end = False + if len(segments_result) == 0 or len(segments_result["text"] > 1): return False - else: - return True + elif segments_result["text"][0][0] != -1: + speech_start = True + elif segments_result["text"][0][1] != -1: + speech_end = True + return speech_start, speech_end def asr(): # 推理 global inference_pipeline2 @@ -106,11 +111,12 @@ def asr(): # 推理 def main(): # 推理 frames = [] # 存储所有的帧数据 buffer = [] # 存储缓存中的帧数据(最多两个片段) - silence_count = 0 # 统计连续静音的次数 - speech_detected = False # 标记是否检测到语音 + # silence_count = 0 # 统计连续静音的次数 + # speech_detected = False # 标记是否检测到语音 RECORD_NUM = 0 global voices global speek + speech_start, speech_end = False, False while True: while not voices.empty(): @@ -121,32 +127,34 @@ def main(): # 推理 if len(buffer) > 2: buffer.pop(0) # 如果缓存超过两个片段,则删除最早的一个 - if speech_detected: + if speech_start: frames.append(data) - RECORD_NUM += 1 - - if vad(data): - if not speech_detected: - print("检测到人声...") - speech_detected = True # 标记为检测到语音 - frames = [] - frames.extend(buffer) # 把之前2个语音数据快加入 - silence_count = 0 # 重置静音次数 - else: - silence_count += 1 # 增加静音次数 - - if speech_detected and (silence_count > 4 or RECORD_NUM > 50): #这里 50 可根据需求改为合适的数据快数量 - print("说话结束或者超过设置最长时间...") - audio_in = b"".join(frames) - #asrt = threading.Thread(target=asr,args=(audio_in,)) - #asrt.start() - speek.put(audio_in) - #rec_result = inference_pipeline2(audio_in=audio_in) # ASR 模型里跑一跑 - frames = [] # 清空所有的帧数据 - buffer = [] # 清空缓存中的帧数据(最多两个片段) - silence_count = 0 # 统计连续静音的次数清零 - speech_detected = False # 标记是否检测到语音 - RECORD_NUM = 0 + RECORD_NUM += 1 + speech_start_i, speech_end_i = vad(data) + if speech_start_i: + speech_start = speech_start_i + # if not speech_detected: + print("检测到人声...") + # speech_detected = True # 标记为检测到语音 + frames = [] + frames.extend(buffer) # 把之前2个语音数据快加入 + # silence_count = 0 # 重置静音次数 + elif speech_end_i or RECORD_NUM > 300: + # silence_count += 1 # 增加静音次数 + # speech_end = speech_end_i + speech_start = False + # if RECORD_NUM > 300: #这里 50 可根据需求改为合适的数据快数量 + print("说话结束或者超过设置最长时间...") + audio_in = b"".join(frames) + #asrt = threading.Thread(target=asr,args=(audio_in,)) + #asrt.start() + speek.put(audio_in) + #rec_result = inference_pipeline2(audio_in=audio_in) # ASR 模型里跑一跑 + frames = [] # 清空所有的帧数据 + buffer = [] # 清空缓存中的帧数据(最多两个片段) + # silence_count = 0 # 统计连续静音的次数清零 + # speech_detected = False # 标记是否检测到语音 + RECORD_NUM = 0 time.sleep(0.01) time.sleep(0.01) From d781aa27e16e25ec04b66bab17b735e56005b3b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Thu, 23 Mar 2023 15:47:04 +0800 Subject: [PATCH 2/3] websocket --- funasr/runtime/python/websocket/ASR_server.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/funasr/runtime/python/websocket/ASR_server.py b/funasr/runtime/python/websocket/ASR_server.py index 0aed2b498..ac63d3c91 100644 --- a/funasr/runtime/python/websocket/ASR_server.py +++ b/funasr/runtime/python/websocket/ASR_server.py @@ -38,6 +38,10 @@ parser.add_argument("--punc_model", type=str, default="", help="model from modelscope") +parser.add_argument("--ngpu", + type=int, + default=1, + help="0 for cpu, 1 for gpu") args = parser.parse_args() From 7a8bedace2ccccaa162b335cad3f3f00acf84b4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Thu, 23 Mar 2023 17:12:02 +0800 Subject: [PATCH 3/3] websocket --- funasr/bin/vad_inference_online.py | 7 ---- funasr/runtime/python/websocket/ASR_server.py | 40 ++++++++----------- 2 files changed, 17 insertions(+), 30 deletions(-) diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py index faee1fc00..dadfd8cac 100644 --- a/funasr/bin/vad_inference_online.py +++ b/funasr/bin/vad_inference_online.py @@ -30,14 +30,7 @@ from funasr.models.frontend.wav_frontend import WavFrontendOnline from funasr.models.frontend.wav_frontend import WavFrontend from funasr.bin.vad_inference import Speech2VadSegment -header_colors = '\033[95m' -end_colors = '\033[0m' -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} class Speech2VadSegmentOnline(Speech2VadSegment): diff --git a/funasr/runtime/python/websocket/ASR_server.py b/funasr/runtime/python/websocket/ASR_server.py index ac63d3c91..cfa9a4224 100644 --- a/funasr/runtime/python/websocket/ASR_server.py +++ b/funasr/runtime/python/websocket/ASR_server.py @@ -56,7 +56,9 @@ vad_pipline = pipeline( model_revision="v1.2.0", output_dir=None, batch_size=1, + mode='online' ) +param_dict_vad = {'in_cache': dict(), "is_final": False} # 创建一个ASR对象 param_dict = dict() @@ -85,17 +87,20 @@ start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["b def vad(data): # 推理 - global vad_pipline + global vad_pipline, param_dict_vad #print(type(data)) - segments_result = vad_pipline(audio_in=data) - #print(segments_result) + # print(param_dict_vad) + segments_result = vad_pipline(audio_in=data, param_dict=param_dict_vad) + # print(segments_result) + # print(param_dict_vad) speech_start = False speech_end = False - if len(segments_result) == 0 or len(segments_result["text"] > 1): - return False - elif segments_result["text"][0][0] != -1: + + if len(segments_result) == 0 or len(segments_result["text"]) > 1: + return speech_start, speech_end + if segments_result["text"][0][0] != -1: speech_start = True - elif segments_result["text"][0][1] != -1: + if segments_result["text"][0][1] != -1: speech_end = True return speech_start, speech_end @@ -135,20 +140,21 @@ def main(): # 推理 frames.append(data) RECORD_NUM += 1 speech_start_i, speech_end_i = vad(data) + # print(speech_start_i, speech_end_i) if speech_start_i: speech_start = speech_start_i # if not speech_detected: - print("检测到人声...") + # print("检测到人声...") # speech_detected = True # 标记为检测到语音 frames = [] frames.extend(buffer) # 把之前2个语音数据快加入 # silence_count = 0 # 重置静音次数 - elif speech_end_i or RECORD_NUM > 300: + if speech_end_i or RECORD_NUM > 300: # silence_count += 1 # 增加静音次数 # speech_end = speech_end_i speech_start = False # if RECORD_NUM > 300: #这里 50 可根据需求改为合适的数据快数量 - print("说话结束或者超过设置最长时间...") + # print("说话结束或者超过设置最长时间...") audio_in = b"".join(frames) #asrt = threading.Thread(target=asr,args=(audio_in,)) #asrt.start() @@ -170,16 +176,4 @@ s = threading.Thread(target=asr) s.start() asyncio.get_event_loop().run_until_complete(start_server) -asyncio.get_event_loop().run_forever() - - - - - - - - - - - - +asyncio.get_event_loop().run_forever() \ No newline at end of file