diff --git a/README.md b/README.md index c31d6168c..7c289e05a 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,6 @@ | [**Highlights**](#highlights) | [**Installation**](#installation) | [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html) -| [**Tutorial_CN**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C) | [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations) | [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime) | [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md) diff --git a/egs_modelscope/asr/TEMPLATE/README.md b/egs_modelscope/asr/TEMPLATE/README.md index 2c5433332..b938e3458 100644 --- a/egs_modelscope/asr/TEMPLATE/README.md +++ b/egs_modelscope/asr/TEMPLATE/README.md @@ -20,11 +20,13 @@ rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyu print(rec_result) ``` #### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary) +##### Streaming Decoding ```python inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online', - model_revision='v1.0.6', + model_revision='v1.0.4', + update_model='v1.0.4', mode='paraformer_streaming' ) import soundfile @@ -42,6 +44,23 @@ speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride] rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict) print(rec_result) ``` + +##### Fake Streaming Decoding +```python +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +inference_pipeline = pipeline( + task=Tasks.auto_speech_recognition, + model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online', + model_revision='v1.0.6', + update_model='v1.0.6', + mode="paraformer_fake_streaming" +) +audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' +rec_result = inference_pipeline(audio_in=audio_in) +print(rec_result) +``` Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241) #### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py index 283f529a3..5fa98e5e3 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py @@ -5,6 +5,7 @@ inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online', model_revision='v1.0.6', + update_model='v1.0.6', mode="paraformer_fake_streaming" ) audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py index 9d49d7dee..77f7939b8 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py @@ -14,7 +14,8 @@ os.environ["MODELSCOPE_CACHE"] = "./" inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online', - model_revision='v1.0.6', + model_revision='v1.0.4', + update_model='v1.0.4', mode="paraformer_streaming" ) diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py index 5fa417b69..869ec0f49 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py @@ -5,6 +5,7 @@ inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online', model_revision='v1.0.6', + update_model='v1.0.6', mode="paraformer_fake_streaming" ) audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py index d1dd441f0..45a2f96d7 100644 --- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py +++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py @@ -14,7 +14,8 @@ os.environ["MODELSCOPE_CACHE"] = "./" inference_pipeline = pipeline( task=Tasks.auto_speech_recognition, model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online', - model_revision='v1.0.6', + model_revision='v1.0.4', + update_model='v1.0.4', mode="paraformer_streaming" ) diff --git a/egs_modelscope/tp/TEMPLATE/README.md b/egs_modelscope/tp/TEMPLATE/README.md index 7cc85088a..3c7129f7d 100644 --- a/egs_modelscope/tp/TEMPLATE/README.md +++ b/egs_modelscope/tp/TEMPLATE/README.md @@ -11,7 +11,7 @@ from modelscope.utils.constant import Tasks inference_pipeline = pipeline( task=Tasks.speech_timestamp, model='damo/speech_timestamp_prediction-v1-16k-offline', - output_dir=None) + model_revision='v1.1.0') rec_result = inference_pipeline( audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav', diff --git a/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv b/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv index 662bf04f0..49eda5fc7 100644 --- a/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv +++ b/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv @@ -1,10 +1,10 @@ dua ribu dua puluh dua 2022 -tiga ribu 300 +tiga ribu 3000 sembilan ribu sembilan ratus sembilan puluh sembilan 9999 -seribu satu 100001 -ribu 100 +seribu satu 1001 +ribu 1000 seribu 1000 -seribu dua ratus delapan puluh sembilan 10289 +seribu dua ratus delapan puluh sembilan 1289 ribu dua ratus delapan puluh sembilan 1289 nol satu dua tiga empat lima enam tujuh delapan sembilan 01 2345-6789 empat belas 14 @@ -22,8 +22,8 @@ satu miliar 1 miliar seratus dua puluh tiga 123 ratus dua puluh tiga 123 dua puluh empat maret 24 maret -ribu tujuh puluh enam 10076 -seribu tujuh puluh enam 100076 -ribu tujuh puluh enam rupiah 10076 rupiah +ribu tujuh puluh enam 1076 +seribu tujuh puluh enam 1076 +ribu tujuh puluh enam rupiah 1076 rupiah tujuh puluh enam 76 -ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima +62 21 6539-0605 \ No newline at end of file +ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima +62 21 6539-0605 diff --git a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py index 539acbc8c..d2f1a770e 100644 --- a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py +++ b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py @@ -26,11 +26,10 @@ class CardinalFst(GraphFst): graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv")) graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv")) graph_thousand = pynini.string_file(get_abs_path("data/numbers/thousand.tsv")) - - graph_cents = pynini.cross("seratus", "100") | pynini.cross("ratus", "100") | pynini.union(graph_hundreds, pynutil.insert("0")) + graph_hundred = pynini.cross("ratus", "") | pynini.cross("seratus", "") - graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("00")) + graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("0")) graph_hundred_component += delete_space graph_hundred_component += pynini.union( graph_teen | pynutil.insert("00"), @@ -44,8 +43,8 @@ class CardinalFst(GraphFst): (graph_ties | pynutil.insert("0")) + delete_space + ( graph_digit | pynutil.insert("0")), ) - graph_hundred_component = graph_hundred_component | graph_cents | graph_one_hundred_component - + graph_hundred_component = graph_hundred_component | graph_one_hundred_component + graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ ( pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT) ) @@ -54,14 +53,12 @@ class CardinalFst(GraphFst): ) graph_thousand = pynini.cross("ribu", "") | pynini.cross("seribu", "") graph_one_thousand_component = pynini.union(pynini.cross("ribu", "1") | pynini.cross("seribu", "1")) - graph_thousand_cents = pynini.cross("seribu", "10") | pynini.cross("ribu","10") | pynini.union(graph_thousand, pynutil.insert("")) + graph_thousands = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space + (pynutil.delete("ribu") | pynutil.delete("seribu")), pynutil.insert("000", weight=0.1), ) - graph_thousand_component = pynini.union(graph_digit + delete_space + graph_thousand, pynutil.insert("000")) - graph_thousand_component += delete_space - graph_thousands = graph_thousands | graph_thousand_cents | graph_thousand_component | graph_one_thousand_component + graph_thousands = graph_thousands | (pynutil.insert("00") + graph_one_thousand_component) graph_million = pynini.union( graph_hundred_component_at_least_one_none_zero_digit + delete_space + (pynutil.delete("juta") | pynutil.delete("sejuta")), diff --git a/funasr/runtime/html5/static/main.js b/funasr/runtime/html5/static/main.js index be57df145..9317778a9 100644 --- a/funasr/runtime/html5/static/main.js +++ b/funasr/runtime/html5/static/main.js @@ -145,7 +145,9 @@ function stop() { isRec = false; info_div.innerHTML="请等候..."; btnStop.disabled = true; - setTimeout(function(){btnStart.disabled = false;info_div.innerHTML="请点击开始";}, 3000 ); + setTimeout(function(){ + console.log("call stop ws!"); + wsconnecter.wsStop();btnStart.disabled = false;info_div.innerHTML="请点击开始";}, 3000 ); rec.stop(function(blob,duration){ console.log(blob); diff --git a/funasr/runtime/html5/static/wsconnecter.js b/funasr/runtime/html5/static/wsconnecter.js index dfa823551..676a94ae5 100644 --- a/funasr/runtime/html5/static/wsconnecter.js +++ b/funasr/runtime/html5/static/wsconnecter.js @@ -28,7 +28,11 @@ function WebSocketConnectMethod( config ) { //定义socket连接方法类 if ( 'WebSocket' in window ) { speechSokt = new WebSocket( Uri ); // 定义socket连接对象 speechSokt.onopen = function(e){onOpen(e);}; // 定义响应函数 - speechSokt.onclose = function(e){onClose(e);}; + speechSokt.onclose = function(e){ + console.log("onclose ws!"); + speechSokt.close(); + onClose(e); + }; speechSokt.onmessage = function(e){onMessage(e);}; speechSokt.onerror = function(e){onError(e);}; return 1; @@ -42,6 +46,7 @@ function WebSocketConnectMethod( config ) { //定义socket连接方法类 // 定义停止与发送函数 this.wsStop = function () { if(speechSokt != undefined) { + console.log("stop ws!"); speechSokt.close(); } }; diff --git a/funasr/runtime/python/websocket/wss_srv_asr.py b/funasr/runtime/python/websocket/wss_srv_asr.py index 6460fbf68..948619b95 100644 --- a/funasr/runtime/python/websocket/wss_srv_asr.py +++ b/funasr/runtime/python/websocket/wss_srv_asr.py @@ -58,16 +58,36 @@ inference_pipeline_asr_online = pipeline( model=args.asr_model_online, ngpu=args.ngpu, ncpu=args.ncpu, - model_revision='v1.0.6', + model_revision='v1.0.4', + update_model='v1.0.4', mode='paraformer_streaming') -print("model loaded") +print("model loaded! only support one client at the same time now!!!!") +async def ws_reset(websocket): + print("ws reset now, total num is ",len(websocket_users)) + websocket.param_dict_asr_online = {"cache": dict()} + websocket.param_dict_vad = {'in_cache': dict(), "is_final": True} + websocket.param_dict_asr_online["is_final"]=True + audio_in=b''.join(np.zeros(int(16000),dtype=np.int16)) + inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad) + inference_pipeline_asr_online(audio_in=audio_in, param_dict=websocket.param_dict_asr_online) + await websocket.close() + + +async def clear_websocket(): + for websocket in websocket_users: + await ws_reset(websocket) + websocket_users.clear() + + + async def ws_serve(websocket, path): frames = [] frames_asr = [] frames_asr_online = [] global websocket_users + await clear_websocket() websocket_users.add(websocket) websocket.param_dict_asr = {} websocket.param_dict_asr_online = {"cache": dict()} @@ -139,7 +159,8 @@ async def ws_serve(websocket, path): except websockets.ConnectionClosed: - print("ConnectionClosed...", websocket_users) + print("ConnectionClosed...", websocket_users,flush=True) + await ws_reset(websocket) websocket_users.remove(websocket) except websockets.InvalidState: print("InvalidState...")