From 0f3571db844432bba5ee9fcfb0260c6bdd1e5a6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=B8=B8=E9=9B=81?= Date: Tue, 16 May 2023 09:40:10 +0800 Subject: [PATCH] inference --- .../finetune.py | 35 - .../infer.py | 13 - .../README.md | 30 - .../finetune.py | 35 - .../infer.py | 14 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../README.md | 53 - .../finetune.py | 36 - .../infer.py | 89 -- .../infer_after_finetune.py | 54 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../finetune.py | 35 - .../infer.py | 13 - .../README.md | 30 - .../finetune.py | 35 - .../infer.py | 14 - .../README.md | 53 - .../finetune.py | 36 - .../infer.py | 87 -- .../infer_after_finetune.py | 104 -- .../README.md | 30 - .../finetune.py | 35 - .../infer.py | 14 - .../demo.py | 10 +- .../{infer.py => demo.py} | 0 .../{infer.py => demo.py} | 0 funasr/bin/asr_infer.py | 6 +- funasr/bin/asr_inference.py | 592 ---------- funasr/bin/asr_inference_launch.py | 2 +- funasr/bin/asr_inference_mfcca.py | 767 ------------ funasr/bin/asr_inference_paraformer.py | 1027 ----------------- .../bin/asr_inference_paraformer_streaming.py | 749 ------------ funasr/bin/asr_inference_rnnt.py | 734 ------------ funasr/bin/asr_inference_uniasr.py | 694 ----------- funasr/bin/diar_infer.py | 346 ++++++ funasr/bin/diar_inference_launch.py | 358 +++++- funasr/bin/eend_ola_inference.py | 429 ------- funasr/bin/modelscope_infer.py | 90 -- ...ion_infer_vadrealtime.py => punc_infer.py} | 304 +++-- funasr/bin/punc_inference_launch.py | 170 ++- funasr/bin/punctuation_infer.py | 320 ----- funasr/bin/sond_inference.py | 577 --------- funasr/bin/sv_infer.py | 163 +++ funasr/bin/sv_inference.py | 443 ------- funasr/bin/sv_inference_launch.py | 164 ++- funasr/bin/tp_infer.py | 115 ++ funasr/bin/tp_inference.py | 399 ------- funasr/bin/tp_inference_launch.py | 172 ++- funasr/bin/vad_infer.py | 196 ++++ funasr/bin/vad_inference.py | 570 --------- funasr/bin/vad_inference_launch.py | 259 ++++- funasr/bin/vad_inference_online.py | 344 ------ funasr/version.txt | 2 +- 69 files changed, 2060 insertions(+), 9219 deletions(-) delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py delete mode 100644 
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md delete mode 100644 
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py delete mode 100644 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py rename egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/{infer.py => demo.py} (100%) rename egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/{infer.py => demo.py} (100%) delete mode 100644 funasr/bin/asr_inference.py delete mode 100644 funasr/bin/asr_inference_mfcca.py delete mode 100644 funasr/bin/asr_inference_paraformer.py delete mode 100644 funasr/bin/asr_inference_paraformer_streaming.py delete mode 100644 funasr/bin/asr_inference_rnnt.py delete mode 100644 funasr/bin/asr_inference_uniasr.py create mode 100755 funasr/bin/diar_infer.py delete mode 100755 funasr/bin/eend_ola_inference.py delete mode 100755 funasr/bin/modelscope_infer.py rename funasr/bin/{punctuation_infer_vadrealtime.py => punc_infer.py} (53%) delete mode 100644 funasr/bin/punctuation_infer.py delete mode 100755 funasr/bin/sond_inference.py create mode 100755 funasr/bin/sv_infer.py delete mode 100755 funasr/bin/sv_inference.py create mode 100644 funasr/bin/tp_infer.py delete mode 100644 funasr/bin/tp_inference.py create mode 100644 funasr/bin/vad_infer.py delete mode 100644 funasr/bin/vad_inference.py delete mode 100644 funasr/bin/vad_inference_online.py diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py deleted file mode 100644 index 3fa3f9d26..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py deleted file mode 100644 index 862f88198..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from 
modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cantonese-CHS.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md deleted file mode 100644 index c68a8cd4f..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# ModelScope Model - -## How to finetune and infer using a pretrained UniASR Model - -### Finetune - -- Modify finetune training related parameters in `finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text. - - batch_bins: # batch size - - max_epoch: # number of training epochs - - lr: # learning rate - -- Then you can run the pipeline to finetune with: -```python - python finetune.py -``` - -### Inference - -Or you can use the finetuned model for inference directly. - -- Setting parameters in `infer.py` - - audio_in: # supports wav, url, bytes, and parsed audio formats. - - output_dir: # If the input format is wav.scp, it needs to be set. - -- Then you can run the pipeline to infer with: -```python - python infer.py -``` diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py deleted file mode 100644 index f15e3b968..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params.output_dir): - os.makedirs(params.output_dir, exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params.data_path) - kwargs = dict( - model=params.model, - model_revision=params.model_revision, - data_dir=ds_dict, - dataset_type=params.dataset_type, - work_dir=params.output_dir, - batch_bins=params.batch_bins, - max_epoch=params.max_epoch, - lr=params.lr) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - from funasr.utils.modelscope_param import modelscope_args - params = modelscope_args(model="speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline", data_path="./data") - params.output_dir = "./checkpoint" # path to save the finetuned model - params.data_path = "./example_data/" # data path - params.dataset_type = "small" # use "small" for small datasets; use "large" if the data exceeds 1000 hours - params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds - params.max_epoch = 20 # maximum number of training epochs - params.lr = 0.00005 # learning rate - - modelscope_finetune(params) diff --git
a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py deleted file mode 100644 index 347d31694..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py +++ /dev/null @@ -1,14 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == '__main__': - audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' - output_dir = None - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in) - print(rec_result) - diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py deleted file mode 100644 index 68d7ba81e..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py deleted file mode 100644 index f82c1f4c4..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_de.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py 
b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py deleted file mode 100644 index 397b7ffcf..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py deleted file mode 100644 index 98f31b602..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py deleted file mode 100644 index 3846ff620..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if 
__name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py deleted file mode 100644 index 75e22a0e9..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_es.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md deleted file mode 100644 index b68f1e921..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# ModelScope Model - -## How to finetune and infer using a pretrained UniASR Model - -### Finetune - -- Modify finetune training related parameters in `finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text` - - dataset_type: # for dataset larger than 1000 hours, set as `large`, otherwise set as `small` - - batch_bins: # batch size. If dataset_type is `small`, `batch_bins` indicates the number of feature frames. If dataset_type is `large`, `batch_bins` indicates the duration in ms - - max_epoch: # number of training epochs - - lr: # learning rate - -- Then you can run the pipeline to finetune with: -```python - python finetune.py -``` - -### Inference - -Or you can use the finetuned model for inference directly. - -- Setting parameters in `infer.py` - - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed - - output_dir: # result dir - - ngpu: # the number of GPUs for decoding - - njob: # the number of jobs for each GPU - -- Then you can run the pipeline to infer with: -```python - python infer.py -``` - -- Results - -The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. - -### Inference using local finetuned model - -- Modify inference related parameters in `infer_after_finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include `test/wav.scp`. 
If `test/text` also exists, the CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - -- Then you can run the pipeline to infer with: -```python - python infer_after_finetune.py -``` - -- Results - -The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py deleted file mode 100644 index 2ecc22917..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer - -from funasr.datasets.ms_dataset import MsDataset -from funasr.utils.modelscope_param import modelscope_args - - -def modelscope_finetune(params): - if not os.path.exists(params.output_dir): - os.makedirs(params.output_dir, exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params.data_path) - kwargs = dict( - model=params.model, - data_dir=ds_dict, - dataset_type=params.dataset_type, - work_dir=params.output_dir, - batch_bins=params.batch_bins, - max_epoch=params.max_epoch, - lr=params.lr) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = modelscope_args(model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline", data_path="./data") - params.output_dir = "./checkpoint" # path to save the finetuned model - params.data_path = "./example_data/" # data path - params.dataset_type = "small" # use "small" for small datasets; use "large" if the data exceeds 1000 hours - params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds - params.max_epoch = 20 # maximum number of training epochs - params.lr = 0.00005 # learning rate - - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py deleted file mode 100644 index e6c39c2b8..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py +++ /dev/null @@ -1,89 +0,0 @@ -import os -import shutil -from multiprocessing import Pool - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -from funasr.utils.compute_wer import compute_wer - - -def modelscope_infer_core(output_dir, split_dir, njob, idx): - output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) - gpu_id = (int(idx) - 1) // njob - if "CUDA_VISIBLE_DEVICES" in os.environ.keys(): - gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",") - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id]) - else: - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline", - output_dir=output_dir_job, - batch_size=1 - ) - audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx)) - inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - - -def modelscope_infer(params): - # prepare for multi-GPU decoding - ngpu = params["ngpu"] - njob 
= params["njob"] - output_dir = params["output_dir"] - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - os.mkdir(output_dir) - split_dir = os.path.join(output_dir, "split") - os.mkdir(split_dir) - nj = ngpu * njob - wav_scp_file = os.path.join(params["data_dir"], "wav.scp") - with open(wav_scp_file) as f: - lines = f.readlines() - num_lines = len(lines) - num_job_lines = num_lines // nj - start = 0 - for i in range(nj): - end = start + num_job_lines - file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1))) - with open(file, "w") as f: - if i == nj - 1: - f.writelines(lines[start:]) - else: - f.writelines(lines[start:end]) - start = end - - p = Pool(nj) - for i in range(nj): - p.apply_async(modelscope_infer_core, - args=(output_dir, split_dir, njob, str(i + 1))) - p.close() - p.join() - - # combine decoding results - best_recog_path = os.path.join(output_dir, "1best_recog") - os.mkdir(best_recog_path) - files = ["text", "token", "score"] - for file in files: - with open(os.path.join(best_recog_path, file), "w") as f: - for i in range(nj): - job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file) - with open(job_file) as f_job: - lines = f_job.readlines() - f.writelines(lines) - - # If text exists, compute CER - text_in = os.path.join(params["data_dir"], "text") - if os.path.exists(text_in): - text_proc_file = os.path.join(best_recog_path, "token") - compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) - os.system("tail -n 3 {}".format(os.path.join(best_recog_path, "text.cer"))) - - -if __name__ == "__main__": - params = {} - params["data_dir"] = "./data/test" - params["output_dir"] = "./results" - params["ngpu"] = 1 - params["njob"] = 8 - modelscope_infer(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py deleted file mode 100644 index 6593f4e3f..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py +++ /dev/null @@ -1,54 +0,0 @@ -import json -import os -import shutil - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -from funasr.utils.compute_wer import compute_wer - - -def modelscope_infer_after_finetune(params): - # prepare for decoding - pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"]) - for file_name in params["required_files"]: - if file_name == "configuration.json": - with open(os.path.join(pretrained_model_path, file_name)) as f: - config_dict = json.load(f) - config_dict["model"]["am_model_name"] = params["decoding_model_name"] - with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f: - json.dump(config_dict, f, indent=4, separators=(',', ': ')) - else: - shutil.copy(os.path.join(pretrained_model_path, file_name), - os.path.join(params["output_dir"], file_name)) - decoding_path = os.path.join(params["output_dir"], "decode_results") - if os.path.exists(decoding_path): - shutil.rmtree(decoding_path) - os.mkdir(decoding_path) - - # decoding - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model=params["output_dir"], - output_dir=decoding_path, - batch_size=1 - ) - audio_in = os.path.join(params["data_dir"], "wav.scp") - inference_pipeline(audio_in=audio_in) - - # computer CER if GT text 
is set - text_in = os.path.join(params["data_dir"], "text") - if os.path.exists(text_in): - text_proc_file = os.path.join(decoding_path, "1best_recog/token") - compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer")) - os.system("tail -n 3 {}".format(os.path.join(decoding_path, "text.cer"))) - - -if __name__ == '__main__': - params = {} - params["modelscope_model_name"] = "damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline" - params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"] - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data/test" - params["decoding_model_name"] = "20epoch.pb" - modelscope_infer_after_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py deleted file mode 100644 index 4746cc2da..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py deleted file mode 100644 index 627d132fc..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_fr.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py deleted file mode 100644 index 985b838ab..000000000 --- 
a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py deleted file mode 100644 index e53c37e60..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_id.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py deleted file mode 100644 index 5485ff56e..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = 
"small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py deleted file mode 100644 index 68cc41d54..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ja.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py deleted file mode 100644 index fd9c44294..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py deleted file mode 100644 index b87bcbb84..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ko.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline", - 
output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py deleted file mode 100644 index 512b844c6..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py deleted file mode 100644 index 4a43e7ce5..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_pt.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py deleted file mode 100644 index 432266dc8..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - 
dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py deleted file mode 100644 index 3c9d364e9..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ru.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py deleted file mode 100644 index 3a90ed21f..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params["output_dir"]): - os.makedirs(params["output_dir"], exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params["data_dir"]) - kwargs = dict( - model=params["model"], - model_revision=params["model_revision"], - data_dir=ds_dict, - dataset_type=params["dataset_type"], - work_dir=params["output_dir"], - batch_bins=params["batch_bins"], - max_epoch=params["max_epoch"], - lr=params["lr"]) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = {} - params["output_dir"] = "./checkpoint" - params["data_dir"] = "./data" - params["batch_bins"] = 2000 - params["dataset_type"] = "small" - params["max_epoch"] = 50 - params["lr"] = 0.00005 - params["model"] = "damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline" - params["model_revision"] = None - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py deleted file mode 100644 index 4218f3d7a..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py +++ /dev/null @@ -1,13 +0,0 @@ -from 
modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == "__main__": - audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_vi.wav" - output_dir = "./results" - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"}) - print(rec_result) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md deleted file mode 100644 index c68a8cd4f..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md +++ /dev/null @@ -1,30 +0,0 @@ -# ModelScope Model - -## How to finetune and infer using a pretrained UniASR Model - -### Finetune - -- Modify finetune training related parameters in `finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text. - - batch_bins: # batch size - - max_epoch: # number of training epochs - - lr: # learning rate - -- Then you can run the pipeline to finetune with: -```python - python finetune.py -``` - -### Inference - -Or you can use the finetuned model for inference directly. - -- Setting parameters in `infer.py` - - audio_in: # supports wav, url, bytes, and parsed audio formats. - - output_dir: # If the input format is wav.scp, it needs to be set. - -- Then you can run the pipeline to infer with: -```python - python infer.py -``` diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py deleted file mode 100644 index 73aae7dd9..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py +++ /dev/null @@ -1,35 +0,0 @@ -import os -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer -from funasr.datasets.ms_dataset import MsDataset - - -def modelscope_finetune(params): - if not os.path.exists(params.output_dir): - os.makedirs(params.output_dir, exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params.data_path) - kwargs = dict( - model=params.model, - model_revision=params.model_revision, - data_dir=ds_dict, - dataset_type=params.dataset_type, - work_dir=params.output_dir, - batch_bins=params.batch_bins, - max_epoch=params.max_epoch, - lr=params.lr) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - from funasr.utils.modelscope_param import modelscope_args - params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", data_path="./data") - params.output_dir = "./checkpoint" # path to save the finetuned model - params.data_path = "./example_data/" # data path - params.dataset_type = "small" # use "small" for small datasets; use "large" if the data exceeds 1000 hours - params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds - params.max_epoch = 20 # maximum number of training epochs - params.lr = 0.00005 # learning rate - - modelscope_finetune(params) diff 
--git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py deleted file mode 100644 index 35209896c..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py +++ /dev/null @@ -1,14 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == '__main__': - audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' - output_dir = None - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", - output_dir=output_dir, - ) - rec_result = inference_pipeline(audio_in=audio_in) - print(rec_result) - diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md deleted file mode 100644 index 9a84f9b57..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md +++ /dev/null @@ -1,53 +0,0 @@ -# ModelScope Model - -## How to finetune and infer using a pretrained UniASR Model - -### Finetune - -- Modify finetune training related parameters in `finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text` - - dataset_type: # for dataset larger than 1000 hours, set as `large`, otherwise set as `small` - - batch_bins: # batch size. If dataset_type is `small`, `batch_bins` indicates the number of feature frames. If dataset_type is `large`, `batch_bins` indicates the duration in ms - - max_epoch: # number of training epochs - - lr: # learning rate - -- Then you can run the pipeline to finetune with: -```python - python finetune.py -``` - -### Inference - -Or you can use the finetuned model for inference directly. - -- Setting parameters in `infer.py` - - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed - - output_dir: # result dir - - ngpu: # the number of GPUs for decoding - - njob: # the number of jobs for each GPU - -- Then you can run the pipeline to infer with: -```python - python infer.py -``` - -- Results - -The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. - -### Inference using local finetuned model - -- Modify inference related parameters in `infer_after_finetune.py` - - output_dir: # result dir - - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed - - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` - -- Then you can run the pipeline to infer with: -```python - python infer_after_finetune.py -``` - -- Results - -The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py deleted file mode 100644 index b2325b2bb..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py +++ /dev/null @@ -1,36 +0,0 @@ -import os - -from modelscope.metainfo import Trainers -from modelscope.trainers import build_trainer - -from funasr.datasets.ms_dataset import MsDataset -from funasr.utils.modelscope_param import modelscope_args - - -def modelscope_finetune(params): - if not os.path.exists(params.output_dir): - os.makedirs(params.output_dir, exist_ok=True) - # dataset split ["train", "validation"] - ds_dict = MsDataset.load(params.data_path) - kwargs = dict( - model=params.model, - data_dir=ds_dict, - dataset_type=params.dataset_type, - work_dir=params.output_dir, - batch_bins=params.batch_bins, - max_epoch=params.max_epoch, - lr=params.lr) - trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs) - trainer.train() - - -if __name__ == '__main__': - params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline", data_path="./data") - params.output_dir = "./checkpoint" # path to save the finetuned model - params.data_path = "./example_data/" # data path - params.dataset_type = "small" # use "small" for small datasets; use "large" if the data exceeds 1000 hours - params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds - params.max_epoch = 20 # maximum number of training epochs - params.lr = 0.00005 # learning rate - - modelscope_finetune(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py deleted file mode 100644 index 13d2a2e37..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py +++ /dev/null @@ -1,87 +0,0 @@ -import os -import shutil -from multiprocessing import Pool - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -from funasr.utils.compute_wer import compute_wer - - -def modelscope_infer_core(output_dir, split_dir, njob, idx): - output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) - gpu_id = (int(idx) - 1) // njob - if "CUDA_VISIBLE_DEVICES" in os.environ.keys(): - gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",") - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id]) - else: - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline", - output_dir=output_dir_job, - batch_size=1 - ) - audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx)) - inference_pipeline(audio_in=audio_in) - -def modelscope_infer(params): - # prepare for multi-GPU decoding - ngpu = params["ngpu"] - njob = params["njob"] - output_dir = params["output_dir"] - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - os.mkdir(output_dir) - split_dir = os.path.join(output_dir, "split") - os.mkdir(split_dir) - nj = ngpu * njob - wav_scp_file = os.path.join(params["data_dir"], "wav.scp") - with open(wav_scp_file) as f: - lines = f.readlines() - num_lines = len(lines) - num_job_lines = num_lines // nj - start = 0 - for i in range(nj): - end = start + 
num_job_lines - file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1))) - with open(file, "w") as f: - if i == nj - 1: - f.writelines(lines[start:]) - else: - f.writelines(lines[start:end]) - start = end - - p = Pool(nj) - for i in range(nj): - p.apply_async(modelscope_infer_core, - args=(output_dir, split_dir, njob, str(i + 1))) - p.close() - p.join() - - # combine decoding results - best_recog_path = os.path.join(output_dir, "1best_recog") - os.mkdir(best_recog_path) - files = ["text", "token", "score"] - for file in files: - with open(os.path.join(best_recog_path, file), "w") as f: - for i in range(nj): - job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file) - with open(job_file) as f_job: - lines = f_job.readlines() - f.writelines(lines) - - # If text exists, compute CER - text_in = os.path.join(params["data_dir"], "text") - if os.path.exists(text_in): - text_proc_file = os.path.join(best_recog_path, "text") - compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer")) - - -if __name__ == "__main__": - params = {} - params["data_dir"] = "./data/test" - params["output_dir"] = "./results" - params["ngpu"] = 1 - params["njob"] = 1 - modelscope_infer(params) diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py deleted file mode 100644 index 1e9c4d1f0..000000000 --- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py +++ /dev/null @@ -1,104 +0,0 @@ -import json -import os -import shutil - -from multiprocessing import Pool -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -from funasr.utils.compute_wer import compute_wer - - -def modelscope_infer_after_finetune_core(model_dir, output_dir, split_dir, njob, idx): - output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) - gpu_id = (int(idx) - 1) // njob - if "CUDA_VISIBLE_DEVICES" in os.environ.keys(): - gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",") - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id]) - else: - os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model=model_dir, - output_dir=output_dir_job, - batch_size=1 - ) - audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx)) - inference_pipeline(audio_in=audio_in) - -def modelscope_infer_after_finetune(params): - # prepare for multi-GPU decoding - model_dir = params["model_dir"] - pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"]) - for file_name in params["required_files"]: - if file_name == "configuration.json": - with open(os.path.join(pretrained_model_path, file_name)) as f: - config_dict = json.load(f) - config_dict["model"]["am_model_name"] = params["decoding_model_name"] - with open(os.path.join(model_dir, "configuration.json"), "w") as f: - json.dump(config_dict, f, indent=4, separators=(',', ': ')) - else: - shutil.copy(os.path.join(pretrained_model_path, file_name), - os.path.join(model_dir, file_name)) - ngpu = params["ngpu"] - njob = params["njob"] - output_dir = params["output_dir"] - if os.path.exists(output_dir): - shutil.rmtree(output_dir) - os.mkdir(output_dir) - split_dir = os.path.join(output_dir, "split") - os.mkdir(split_dir) - nj = ngpu * njob - 
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
-    with open(wav_scp_file) as f:
-        lines = f.readlines()
-        num_lines = len(lines)
-        num_job_lines = num_lines // nj
-    start = 0
-    for i in range(nj):
-        end = start + num_job_lines
-        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
-        with open(file, "w") as f:
-            if i == nj - 1:
-                f.writelines(lines[start:])
-            else:
-                f.writelines(lines[start:end])
-        start = end
-
-    p = Pool(nj)
-    for i in range(nj):
-        p.apply_async(modelscope_infer_after_finetune_core,
-                      args=(model_dir, output_dir, split_dir, njob, str(i + 1)))
-    p.close()
-    p.join()
-
-    # combine decoding results
-    best_recog_path = os.path.join(output_dir, "1best_recog")
-    os.mkdir(best_recog_path)
-    files = ["text", "token", "score"]
-    for file in files:
-        with open(os.path.join(best_recog_path, file), "w") as f:
-            for i in range(nj):
-                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
-                with open(job_file) as f_job:
-                    lines = f_job.readlines()
-                f.writelines(lines)
-
-    # If text exists, compute CER
-    text_in = os.path.join(params["data_dir"], "text")
-    if os.path.exists(text_in):
-        text_proc_file = os.path.join(best_recog_path, "token")
-        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
-    params["model_dir"] = "./checkpoint"
-    params["output_dir"] = "./results"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pb"
-    params["ngpu"] = 1
-    params["njob"] = 1
-    modelscope_infer_after_finetune(params)
-
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md
deleted file mode 100644
index c68a8cd4f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained UniASR Model
-
-### Finetune
-
-- Modify the finetuning-related training parameters in `finetune.py`
-    - output_dir: # result dir
-    - data_dir: # the dataset dir needs to include the files train/wav.scp, train/text, validation/wav.scp, and validation/text
-    - batch_bins: # batch size
-    - max_epoch: # number of training epochs
-    - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Alternatively, you can use the pretrained model for inference directly.
-
-- Set the parameters in `infer.py`
-    - audio_in: # supports wav, url, bytes, and parsed audio formats
-    - output_dir: # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py
deleted file mode 100644
index b18296ecc..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
-    if not os.path.exists(params.output_dir):
-        os.makedirs(params.output_dir, exist_ok=True)
-    # dataset split ["train", "validation"]
-    ds_dict = MsDataset.load(params.data_path)
-    kwargs = dict(
-        model=params.model,
-        model_revision=params.model_revision,
-        data_dir=ds_dict,
-        dataset_type=params.dataset_type,
-        work_dir=params.output_dir,
-        batch_bins=params.batch_bins,
-        max_epoch=params.max_epoch,
-        lr=params.lr)
-    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
-    trainer.train()
-
-
-if __name__ == '__main__':
-    from funasr.utils.modelscope_param import modelscope_args
-    params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline", data_path="./data")
-    params.output_dir = "./checkpoint"      # path to save the finetuned model
-    params.data_path = "./example_data/"    # path to the training data
-    params.dataset_type = "small"           # use "small" for small datasets; use "large" if the data exceeds 1000 hours
-    params.batch_bins = 2000                # batch size: fbank feature frames if dataset_type="small", milliseconds if dataset_type="large"
-    params.max_epoch = 20                   # maximum number of training epochs
-    params.lr = 0.00005                     # learning rate
-
-    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
deleted file mode 100644
index 8ec42885d..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
-    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline",
-        output_dir=output_dir,
-    )
-    rec_result = inference_pipeline(audio_in=audio_in)
-    print(rec_result)
-
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
index 20994d39c..45b5e331e 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
@@ -1,12 +1,12 @@
-##################text.scp文件路径###################
+##################text.scp###################
-inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"
+# inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"
-##################text二进制数据##################### +##################text##################### #inputs = "我们都是木头人不会讲话不会动" -##################text文件url####################### -#inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" +##################text file url####################### +inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt" from modelscope.pipelines import pipeline diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/demo.py similarity index 100% rename from egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py rename to egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/demo.py diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/demo.py similarity index 100% rename from egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py rename to egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/demo.py diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py index dce9ee009..488be16ee 100644 --- a/funasr/bin/asr_infer.py +++ b/funasr/bin/asr_infer.py @@ -46,11 +46,12 @@ from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaform from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard -from funasr.bin.tp_inference import SpeechText2Timestamp +from funasr.bin.tp_infer import Speech2Timestamp from funasr.bin.vad_inference import Speech2VadSegment -from funasr.bin.punctuation_infer import Text2Punc +from funasr.bin.punc_infer import Text2Punc from funasr.utils.vad_utils import slice_padding_fbank from funasr.tasks.vad import VADTask + from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard @@ -616,6 +617,7 @@ class Speech2TextParaformerOnline: # 1. Build ASR model scorers = {} + from funasr.tasks.asr import ASRTaskParaformer as ASRTask asr_model, asr_train_args = ASRTask.build_model_from_file( asr_train_config, asr_model_file, cmvn_file, device ) diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py deleted file mode 100644 index f70382bf1..000000000 --- a/funasr/bin/asr_inference.py +++ /dev/null @@ -1,592 +0,0 @@ -#!/usr/bin/env python3 -# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved. 
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -import argparse -import logging -import sys -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.beam_search.batch_beam_search import BatchBeamSearch -from funasr.modules.beam_search.batch_beam_search_online_sim import BatchBeamSearchOnlineSim -from funasr.modules.beam_search.beam_search import BeamSearch -from funasr.modules.beam_search.beam_search import Hypothesis -from funasr.modules.scorers.ctc import CTCPrefixScorer -from funasr.modules.scorers.length_bonus import LengthBonus -from funasr.modules.scorers.scorer_interface import BatchScorerInterface -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.asr import ASRTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend -from funasr.tasks.asr import frontend_choices - - -header_colors = '\033[95m' -end_colors = '\033[0m' - - -class Speech2Text: - """Speech2Text class - - Examples: - >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2text(audio) - [(text, token, token_int, hypothesis object), ...] - - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - batch_size: int = 1, - dtype: str = "float32", - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - streaming: bool = False, - frontend_conf: dict = None, - **kwargs, - ): - assert check_argument_types() - - # 1. 
Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - frontend = None - if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: - if asr_train_args.frontend=='wav_frontend': - frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) - else: - frontend_class=frontend_choices.get_class(asr_train_args.frontend) - frontend = frontend_class(**asr_train_args.frontend_conf).eval() - - logging.info("asr_model: {}".format(asr_model)) - logging.info("asr_train_args: {}".format(asr_train_args)) - asr_model.to(dtype=getattr(torch, dtype)).eval() - - decoder = asr_model.decoder - - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - token_list = asr_model.token_list - scorers.update( - decoder=decoder, - ctc=ctc, - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, None, device - ) - scorers["lm"] = lm.lm - - # 3. Build ngram model - # ngram is not supported now - ngram = None - scorers["ngram"] = ngram - - # 4. Build BeamSearch object - # transducer is not supported now - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else "full", - ) - - # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - self.beam_search = beam_search - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - self.frontend = frontend - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None - ) -> List[ - Tuple[ - Optional[str], - List[str], - List[int], - Union[Hypothesis], - ] - ]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.asr_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - lfr_factor = max(1, (feats.size()[-1] // 80) - 1) - batch = {"speech": feats, "speech_lengths": feats_len} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. 
Forward Encoder - enc, _ = self.asr_model.encode(**batch) - if isinstance(enc, tuple): - enc = enc[0] - assert len(enc) == 1, len(enc) - - # c. Passed the encoder result and the beam search - nbest_hyps = self.beam_search( - x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio - ) - - nbest_hyps = nbest_hyps[: self.nbest] - - results = [] - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - results.append((text, token, token_int, hyp)) - - assert check_return_type(results) - return results - - - -def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - mc: bool = False, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - for handler in logging.root.handlers[:]: - logging.root.removeHandler(handler) - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - streaming=streaming, - ) - logging.info("speech2text_kwargs: {}".format(speech2text_kwargs)) - speech2text = Speech2Text(**speech2text_kwargs) - - def _forward(data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - # 3. 
Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - fs=fs, - mc=mc, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), - collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - asr_result_list = [] - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - else: - writer = None - - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - # N-best list of (text, token, token_int, hyp_object) - try: - results = speech2text(**batch) - except TooShortUttError as e: - logging.warning(f"Utterance {keys} {e}") - hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) - results = [[" ", ["sil"], [2], hyp]] * nbest - - # Only supporting batch_size==1 - key = keys[0] - for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): - # Create a directory: outdir/{n}best_recog - if writer is not None: - ibest_writer = writer[f"{n}best_recog"] - - # Write the result to each file - ibest_writer["token"][key] = " ".join(token) - ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["score"][key] = str(hyp.score) - - if text is not None: - text_postprocessed, _ = postprocess_utils.sentence_postprocess(token) - item = {'key': key, 'value': text_postprocessed} - asr_result_list.append(item) - finish_count += 1 - asr_utils.print_progress(finish_count / file_count) - if writer is not None: - ibest_writer["text"][key] = text - - logging.info("uttid: {}".format(key)) - logging.info("text predictions: {}\n".format(text)) - return asr_result_list - - return _forward - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="ASR Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--word_lm_train_config", - type=str, - help="Word LM training configuration", - ) - group.add_argument( - "--word_lm_file", - type=str, - help="Word LM parameter file", - ) - group.add_argument( - "--ngram_file", - type=str, - help="N-gram parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=20, help="Beam size") - group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty") - group.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain max output length. " - "If maxlenratio=0.0 (default), it uses a end-detect " - "function " - "to automatically find maximum hypothesis lengths." - "If maxlenratio<0.0, its absolute value is interpreted" - "as a constant max output length", - ) - group.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - group.add_argument( - "--ctc_weight", - type=float, - default=0.5, - help="CTC weight in joint decoding", - ) - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") - group.add_argument("--streaming", type=str2bool, default=False) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. 
" - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. " - "If not given, refers from the training args", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py index 6ad17f0c6..18700321c 100644 --- a/funasr/bin/asr_inference_launch.py +++ b/funasr/bin/asr_inference_launch.py @@ -852,7 +852,7 @@ def inference_uniasr( decoding_ind=decoding_ind, decoding_mode=decoding_mode, ) - speech2text = Speech2Text(**speech2text_kwargs) + speech2text = Speech2TextUniASR(**speech2text_kwargs) def _forward(data_path_and_name_and_type, raw_inputs: Union[np.ndarray, torch.Tensor] = None, diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py deleted file mode 100644 index e83286958..000000000 --- a/funasr/bin/asr_inference_mfcca.py +++ /dev/null @@ -1,767 +0,0 @@ -#!/usr/bin/env python3 -# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved. -# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0) - -import argparse -import logging -import sys -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.beam_search.batch_beam_search import BatchBeamSearch -from funasr.modules.beam_search.beam_search import BeamSearch -from funasr.modules.beam_search.beam_search import Hypothesis -from funasr.modules.scorers.ctc import CTCPrefixScorer -from funasr.modules.scorers.length_bonus import LengthBonus -from funasr.modules.scorers.scorer_interface import BatchScorerInterface -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.asr import ASRTaskMFCCA as ASRTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -import pdb - - -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} - -class Speech2Text: - """Speech2Text class - - Examples: - >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2text(audio) - [(text, token, token_int, hypothesis object), ...] 
- - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - batch_size: int = 1, - dtype: str = "float32", - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - streaming: bool = False, - **kwargs, - ): - assert check_argument_types() - - # 1. Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - - logging.info("asr_model: {}".format(asr_model)) - logging.info("asr_train_args: {}".format(asr_train_args)) - asr_model.to(dtype=getattr(torch, dtype)).eval() - - decoder = asr_model.decoder - - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - token_list = asr_model.token_list - scorers.update( - decoder=decoder, - ctc=ctc, - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device - ) - lm.to(device) - scorers["lm"] = lm.lm - # 3. Build ngram model - # ngram is not supported now - ngram = None - scorers["ngram"] = ngram - - # 4. Build BeamSearch object - # transducer is not supported now - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else "full", - ) - #beam_search.__class__ = BatchBeamSearch - # 5. [Optional] Build Text converter: e.g. 
bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - self.beam_search = beam_search - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None - ) -> List[ - Tuple[ - Optional[str], - List[str], - List[int], - Union[Hypothesis], - ] - ]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - if(speech.dim()==3): - speech = torch.squeeze(speech, 2) - #speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - speech = speech.to(getattr(torch, self.dtype)) - # lenghts: (1,) - lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) - batch = {"speech": speech, "speech_lengths": lengths} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - enc, _ = self.asr_model.encode(**batch) - - assert len(enc) == 1, len(enc) - - # c. 
Passed the encoder result and the beam search - nbest_hyps = self.beam_search( - x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio - ) - - nbest_hyps = nbest_hyps[: self.nbest] - - results = [] - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - results.append((text, token, token_int, hyp)) - - assert check_return_type(results) - return results - - -# def inference( -# maxlenratio: float, -# minlenratio: float, -# batch_size: int, -# beam_size: int, -# ngpu: int, -# ctc_weight: float, -# lm_weight: float, -# penalty: float, -# log_level: Union[int, str], -# data_path_and_name_and_type, -# asr_train_config: Optional[str], -# asr_model_file: Optional[str], -# cmvn_file: Optional[str] = None, -# lm_train_config: Optional[str] = None, -# lm_file: Optional[str] = None, -# token_type: Optional[str] = None, -# key_file: Optional[str] = None, -# word_lm_train_config: Optional[str] = None, -# bpemodel: Optional[str] = None, -# allow_variable_data_keys: bool = False, -# streaming: bool = False, -# output_dir: Optional[str] = None, -# dtype: str = "float32", -# seed: int = 0, -# ngram_weight: float = 0.9, -# nbest: int = 1, -# num_workers: int = 1, -# **kwargs, -# ): -# assert check_argument_types() -# if batch_size > 1: -# raise NotImplementedError("batch decoding is not implemented") -# if word_lm_train_config is not None: -# raise NotImplementedError("Word LM is not implemented") -# if ngpu > 1: -# raise NotImplementedError("only single GPU decoding is supported") -# -# logging.basicConfig( -# level=log_level, -# format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", -# ) -# -# if ngpu >= 1 and torch.cuda.is_available(): -# device = "cuda" -# else: -# device = "cpu" -# -# # 1. Set random-seed -# set_all_random_seed(seed) -# -# # 2. Build speech2text -# speech2text_kwargs = dict( -# asr_train_config=asr_train_config, -# asr_model_file=asr_model_file, -# cmvn_file=cmvn_file, -# lm_train_config=lm_train_config, -# lm_file=lm_file, -# token_type=token_type, -# bpemodel=bpemodel, -# device=device, -# maxlenratio=maxlenratio, -# minlenratio=minlenratio, -# dtype=dtype, -# beam_size=beam_size, -# ctc_weight=ctc_weight, -# lm_weight=lm_weight, -# ngram_weight=ngram_weight, -# penalty=penalty, -# nbest=nbest, -# streaming=streaming, -# ) -# logging.info("speech2text_kwargs: {}".format(speech2text_kwargs)) -# speech2text = Speech2Text(**speech2text_kwargs) -# -# # 3. 
Build data-iterator -# loader = ASRTask.build_streaming_iterator( -# data_path_and_name_and_type, -# dtype=dtype, -# batch_size=batch_size, -# key_file=key_file, -# num_workers=num_workers, -# preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), -# collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), -# allow_variable_data_keys=allow_variable_data_keys, -# inference=True, -# ) -# -# finish_count = 0 -# file_count = 1 -# # 7 .Start for-loop -# # FIXME(kamo): The output format should be discussed about -# asr_result_list = [] -# if output_dir is not None: -# writer = DatadirWriter(output_dir) -# else: -# writer = None -# -# for keys, batch in loader: -# assert isinstance(batch, dict), type(batch) -# assert all(isinstance(s, str) for s in keys), keys -# _bs = len(next(iter(batch.values()))) -# assert len(keys) == _bs, f"{len(keys)} != {_bs}" -# #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} -# -# # N-best list of (text, token, token_int, hyp_object) -# try: -# results = speech2text(**batch) -# except TooShortUttError as e: -# logging.warning(f"Utterance {keys} {e}") -# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) -# results = [[" ", [""], [2], hyp]] * nbest -# -# # Only supporting batch_size==1 -# key = keys[0] -# for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): -# # Create a directory: outdir/{n}best_recog -# if writer is not None: -# ibest_writer = writer[f"{n}best_recog"] -# -# # Write the result to each file -# ibest_writer["token"][key] = " ".join(token) -# ibest_writer["token_int"][key] = " ".join(map(str, token_int)) -# ibest_writer["score"][key] = str(hyp.score) -# -# if text is not None: -# text_postprocessed = postprocess_utils.sentence_postprocess(token) -# item = {'key': key, 'value': text_postprocessed} -# asr_result_list.append(item) -# finish_count += 1 -# asr_utils.print_progress(finish_count / file_count) -# if writer is not None: -# ibest_writer["text"][key] = text -# return asr_result_list - -def inference( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - **kwargs, -): - inference_pipeline = inference_modelscope( - maxlenratio=maxlenratio, - minlenratio=minlenratio, - batch_size=batch_size, - beam_size=beam_size, - ngpu=ngpu, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - penalty=penalty, - log_level=log_level, - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - raw_inputs=raw_inputs, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - key_file=key_file, - word_lm_train_config=word_lm_train_config, - bpemodel=bpemodel, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - output_dir=output_dir, - dtype=dtype, - 
seed=seed, - ngram_weight=ngram_weight, - nbest=nbest, - num_workers=num_workers, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - -def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - streaming=streaming, - ) - logging.info("speech2text_kwargs: {}".format(speech2text_kwargs)) - speech2text = Speech2Text(**speech2text_kwargs) - - def _forward(data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - # 3. 
Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - fs=fs, - mc=True, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), - collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - asr_result_list = [] - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - else: - writer = None - - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - # N-best list of (text, token, token_int, hyp_object) - try: - results = speech2text(**batch) - except TooShortUttError as e: - logging.warning(f"Utterance {keys} {e}") - hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) - results = [[" ", [""], [2], hyp]] * nbest - - # Only supporting batch_size==1 - key = keys[0] - for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): - # Create a directory: outdir/{n}best_recog - if writer is not None: - ibest_writer = writer[f"{n}best_recog"] - - # Write the result to each file - ibest_writer["token"][key] = " ".join(token) - # ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["score"][key] = str(hyp.score) - - if text is not None: - text_postprocessed = postprocess_utils.sentence_postprocess(token) - item = {'key': key, 'value': text_postprocessed} - asr_result_list.append(item) - finish_count += 1 - asr_utils.print_progress(finish_count / file_count) - if writer is not None: - ibest_writer["text"][key] = text - return asr_result_list - - return _forward - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="ASR Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--word_lm_train_config", - type=str, - help="Word LM training configuration", - ) - group.add_argument( - "--word_lm_file", - type=str, - help="Word LM parameter file", - ) - group.add_argument( - "--ngram_file", - type=str, - help="N-gram parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=20, help="Beam size") - group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty") - group.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain max output length. " - "If maxlenratio=0.0 (default), it uses a end-detect " - "function " - "to automatically find maximum hypothesis lengths." - "If maxlenratio<0.0, its absolute value is interpreted" - "as a constant max output length", - ) - group.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - group.add_argument( - "--ctc_weight", - type=float, - default=0.5, - help="CTC weight in joint decoding", - ) - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") - group.add_argument("--streaming", type=str2bool, default=False) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. 
" - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. " - "If not given, refers from the training args", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py deleted file mode 100644 index ecdb62abc..000000000 --- a/funasr/bin/asr_inference_paraformer.py +++ /dev/null @@ -1,1027 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import sys -import time -import copy -import os -import codecs -import tempfile -import requests -from pathlib import Path -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict -from typing import Any -from typing import List - -import numpy as np -import torch -from typeguard import check_argument_types - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch -from funasr.modules.beam_search.beam_search import Hypothesis -from funasr.modules.scorers.ctc import CTCPrefixScorer -from funasr.modules.scorers.length_bonus import LengthBonus -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.asr import ASRTaskParaformer as ASRTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend -from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer -from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer -from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export -from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard -from funasr.bin.tp_inference import SpeechText2Timestamp -from funasr.bin.vad_inference import Speech2VadSegment -from funasr.bin.punctuation_infer import Text2Punc -from funasr.utils.vad_utils import slice_padding_fbank -from funasr.tasks.vad import VADTask -from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard - -class Speech2Text: - """Speech2Text class - - Examples: - >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2text(audio) - [(text, token, token_int, hypothesis object), ...] 
- - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - dtype: str = "float32", - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - frontend_conf: dict = None, - hotword_list_or_file: str = None, - **kwargs, - ): - assert check_argument_types() - - # 1. Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - frontend = None - if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: - frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) - - logging.info("asr_model: {}".format(asr_model)) - logging.info("asr_train_args: {}".format(asr_train_args)) - asr_model.to(dtype=getattr(torch, dtype)).eval() - - if asr_model.ctc != None: - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - scorers.update( - ctc=ctc - ) - token_list = asr_model.token_list - scorers.update( - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device - ) - scorers["lm"] = lm.lm - - # 3. Build ngram model - # ngram is not supported now - ngram = None - scorers["ngram"] = ngram - - # 4. Build BeamSearch object - # transducer is not supported now - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else "full", - ) - - beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() - for scorer in scorers.values(): - if isinstance(scorer, torch.nn.Module): - scorer.to(device=device, dtype=getattr(torch, dtype)).eval() - - logging.info(f"Decoding device={device}, dtype={dtype}") - - # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - - # 6. 
[Optional] Build hotword list from str, local file or url - self.hotword_list = None - self.hotword_list = self.generate_hotwords_list(hotword_list_or_file) - - is_use_lm = lm_weight != 0.0 and lm_file is not None - if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm: - beam_search = None - self.beam_search = beam_search - logging.info(f"Beam_search: {self.beam_search}") - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - self.frontend = frontend - self.encoder_downsampling_factor = 1 - if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d": - self.encoder_downsampling_factor = 4 - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, - begin_time: int = 0, end_time: int = None, - ): - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.asr_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - lfr_factor = max(1, (feats.size()[-1] // 80) - 1) - batch = {"speech": feats, "speech_lengths": feats_len} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - enc, enc_len = self.asr_model.encode(**batch) - if isinstance(enc, tuple): - enc = enc[0] - # assert len(enc) == 1, len(enc) - enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor - - predictor_outs = self.asr_model.calc_predictor(enc, enc_len) - pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ - predictor_outs[2], predictor_outs[3] - pre_token_length = pre_token_length.round().long() - if torch.max(pre_token_length) < 1: - return [] - if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model, NeatContextualParaformer): - if self.hotword_list: - logging.warning("Hotword is given but asr model is not a ContextualParaformer.") - decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length) - decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] - else: - decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list) - decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] - - if isinstance(self.asr_model, BiCifParaformer): - _, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len, - pre_token_length) # test no bias cif2 - - results = [] - b, n, d = decoder_out.size() - for i in range(b): - x = enc[i, :enc_len[i], :] - am_scores = decoder_out[i, :pre_token_length[i], :] - if self.beam_search is not None: - nbest_hyps = self.beam_search( - x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio - ) - - nbest_hyps = nbest_hyps[: self.nbest] - else: - yseq = am_scores.argmax(dim=-1) - score = am_scores.max(dim=-1)[0] - score = torch.sum(score, dim=-1) - # pad with mask tokens to ensure compatibility with sos/eos tokens - yseq = torch.tensor( - 
[self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device - ) - nbest_hyps = [Hypothesis(yseq=yseq, score=score)] - - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0 and x != 2, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - timestamp = [] - if isinstance(self.asr_model, BiCifParaformer): - _, timestamp = ts_prediction_lfr6_standard(us_alphas[i][:enc_len[i]*3], - us_peaks[i][:enc_len[i]*3], - copy.copy(token), - vad_offset=begin_time) - results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor)) - - - # assert check_return_type(results) - return results - - def generate_hotwords_list(self, hotword_list_or_file): - # for None - if hotword_list_or_file is None: - hotword_list = None - # for local txt inputs - elif os.path.exists(hotword_list_or_file) and hotword_list_or_file.endswith('.txt'): - logging.info("Attempting to parse hotwords from local txt...") - hotword_list = [] - hotword_str_list = [] - with codecs.open(hotword_list_or_file, 'r') as fin: - for line in fin.readlines(): - hw = line.strip() - hotword_str_list.append(hw) - hotword_list.append(self.converter.tokens2ids([i for i in hw])) - hotword_list.append([self.asr_model.sos]) - hotword_str_list.append('') - logging.info("Initialized hotword list from file: {}, hotword list: {}." - .format(hotword_list_or_file, hotword_str_list)) - # for url, download and generate txt - elif hotword_list_or_file.startswith('http'): - logging.info("Attempting to parse hotwords from url...") - work_dir = tempfile.TemporaryDirectory().name - if not os.path.exists(work_dir): - os.makedirs(work_dir) - text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file)) - local_file = requests.get(hotword_list_or_file) - open(text_file_path, "wb").write(local_file.content) - hotword_list_or_file = text_file_path - hotword_list = [] - hotword_str_list = [] - with codecs.open(hotword_list_or_file, 'r') as fin: - for line in fin.readlines(): - hw = line.strip() - hotword_str_list.append(hw) - hotword_list.append(self.converter.tokens2ids([i for i in hw])) - hotword_list.append([self.asr_model.sos]) - hotword_str_list.append('') - logging.info("Initialized hotword list from file: {}, hotword list: {}." 
- .format(hotword_list_or_file, hotword_str_list)) - # for text str input - elif not hotword_list_or_file.endswith('.txt'): - logging.info("Attempting to parse hotwords as str...") - hotword_list = [] - hotword_str_list = [] - for hw in hotword_list_or_file.strip().split(): - hotword_str_list.append(hw) - hotword_list.append(self.converter.tokens2ids([i for i in hw])) - hotword_list.append([self.asr_model.sos]) - hotword_str_list.append('') - logging.info("Hotword list: {}.".format(hotword_str_list)) - else: - hotword_list = None - return hotword_list - - - -def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - output_dir: Optional[str] = None, - timestamp_infer_config: Union[Path, str] = None, - timestamp_model_file: Union[Path, str] = None, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - export_mode = False - if param_dict is not None: - hotword_list_or_file = param_dict.get('hotword') - export_mode = param_dict.get("export_mode", False) - else: - hotword_list_or_file = None - - if kwargs.get("device", None) == "cpu": - ngpu = 0 - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - batch_size = 1 - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. 
Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - hotword_list_or_file=hotword_list_or_file, - ) - - speech2text = Speech2Text(**speech2text_kwargs) - - if timestamp_model_file is not None: - speechtext2timestamp = SpeechText2Timestamp( - timestamp_cmvn_file=cmvn_file, - timestamp_model_file=timestamp_model_file, - timestamp_infer_config=timestamp_infer_config, - ) - else: - speechtext2timestamp = None - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - - hotword_list_or_file = None - if param_dict is not None: - hotword_list_or_file = param_dict.get('hotword') - if 'hotword' in kwargs and kwargs['hotword'] is not None: - hotword_list_or_file = kwargs['hotword'] - if hotword_list_or_file is not None or 'hotword' in kwargs: - speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) - - # 3. Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - fs=fs, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), - collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - if param_dict is not None: - use_timestamp = param_dict.get('use_timestamp', True) - else: - use_timestamp = True - - forward_time_total = 0.0 - length_total = 0.0 - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - asr_result_list = [] - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - else: - writer = None - - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")} - - logging.info("decoding, utt_id: {}".format(keys)) - # N-best list of (text, token, token_int, hyp_object) - - time_beg = time.time() - results = speech2text(**batch) - if len(results) < 1: - hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) - results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest - time_end = time.time() - forward_time = time_end - time_beg - lfr_factor = results[0][-1] - length = results[0][-2] - forward_time_total += forward_time - length_total += length - rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time, 100 * forward_time / (length * lfr_factor)) - logging.info(rtf_cur) - - for batch_id in range(_bs): - result = [results[batch_id][:-2]] - - key = 
keys[batch_id] - for n, result in zip(range(1, nbest + 1), result): - text, token, token_int, hyp = result[0], result[1], result[2], result[3] - timestamp = result[4] if len(result[4]) > 0 else None - # conduct timestamp prediction here - # timestamp inference requires token length - # thus following inference cannot be conducted in batch - if timestamp is None and speechtext2timestamp: - ts_batch = {} - ts_batch['speech'] = batch['speech'][batch_id].unsqueeze(0) - ts_batch['speech_lengths'] = torch.tensor([batch['speech_lengths'][batch_id]]) - ts_batch['text_lengths'] = torch.tensor([len(token)]) - us_alphas, us_peaks = speechtext2timestamp(**ts_batch) - ts_str, timestamp = ts_prediction_lfr6_standard(us_alphas[0], us_peaks[0], token, force_time_shift=-3.0) - # Create a directory: outdir/{n}best_recog - if writer is not None: - ibest_writer = writer[f"{n}best_recog"] - - # Write the result to each file - ibest_writer["token"][key] = " ".join(token) - # ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["score"][key] = str(hyp.score) - ibest_writer["rtf"][key] = rtf_cur - - if text is not None: - if use_timestamp and timestamp is not None: - postprocessed_result = postprocess_utils.sentence_postprocess(token, timestamp) - else: - postprocessed_result = postprocess_utils.sentence_postprocess(token) - timestamp_postprocessed = "" - if len(postprocessed_result) == 3: - text_postprocessed, timestamp_postprocessed, word_lists = postprocessed_result[0], \ - postprocessed_result[1], \ - postprocessed_result[2] - else: - text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] - item = {'key': key, 'value': text_postprocessed} - if timestamp_postprocessed != "": - item['timestamp'] = timestamp_postprocessed - asr_result_list.append(item) - finish_count += 1 - # asr_utils.print_progress(finish_count / file_count) - if writer is not None: - ibest_writer["text"][key] = " ".join(word_lists) - - logging.info("decoding, utt: {}, predictions: {}".format(key, text)) - rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)) - logging.info(rtf_avg) - if writer is not None: - ibest_writer["rtf"]["rtf_avf"] = rtf_avg - return asr_result_list - - return _forward - - -def inference_modelscope_vad_punc( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - vad_infer_config: Optional[str] = None, - vad_model_file: Optional[str] = None, - vad_cmvn_file: Optional[str] = None, - time_stamp_writer: bool = True, - punc_infer_config: Optional[str] = None, - punc_model_file: Optional[str] = None, - outputs_dict: Optional[bool] = True, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) 
- - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if param_dict is not None: - hotword_list_or_file = param_dict.get('hotword') - else: - hotword_list_or_file = None - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - - # 3. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - hotword_list_or_file=hotword_list_or_file, - ) - speech2text = Speech2Text(**speech2text_kwargs) - text2punc = None - if punc_model_file is not None: - text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype) - - if output_dir is not None: - writer = DatadirWriter(output_dir) - ibest_writer = writer[f"1best_recog"] - ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) - - def _forward(data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - - hotword_list_or_file = None - if param_dict is not None: - hotword_list_or_file = param_dict.get('hotword') - - if 'hotword' in kwargs: - hotword_list_or_file = kwargs['hotword'] - - if speech2text.hotword_list is None: - speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) - - # 3. 
Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - fs=fs, - batch_size=1, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - if param_dict is not None: - use_timestamp = param_dict.get('use_timestamp', True) - else: - use_timestamp = True - - finish_count = 0 - file_count = 1 - lfr_factor = 6 - # 7 .Start for-loop - asr_result_list = [] - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - writer = None - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - - vad_results = speech2vadsegment(**batch) - _, vadsegments = vad_results[0], vad_results[1][0] - - speech, speech_lengths = batch["speech"], batch["speech_lengths"] - - n = len(vadsegments) - data_with_index = [(vadsegments[i], i) for i in range(n)] - sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0]) - results_sorted = [] - for j, beg_idx in enumerate(range(0, n, batch_size)): - end_idx = min(n, beg_idx + batch_size) - speech_j, speech_lengths_j = slice_padding_fbank(speech, speech_lengths, sorted_data[beg_idx:end_idx]) - - batch = {"speech": speech_j, "speech_lengths": speech_lengths_j} - batch = to_device(batch, device=device) - results = speech2text(**batch) - - if len(results) < 1: - results = [["", [], [], [], [], [], []]] - results_sorted.extend(results) - restored_data = [0] * n - for j in range(n): - index = sorted_data[j][1] - restored_data[index] = results_sorted[j] - result = ["", [], [], [], [], [], []] - for j in range(n): - result[0] += restored_data[j][0] - result[1] += restored_data[j][1] - result[2] += restored_data[j][2] - if len(restored_data[j][4]) > 0: - for t in restored_data[j][4]: - t[0] += vadsegments[j][0] - t[1] += vadsegments[j][0] - result[4] += restored_data[j][4] - # result = [result[k]+restored_data[j][k] for k in range(len(result[:-2]))] - - key = keys[0] - # result = result_segments[0] - text, token, token_int = result[0], result[1], result[2] - time_stamp = result[4] if len(result[4]) > 0 else None - - if use_timestamp and time_stamp is not None: - postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) - else: - postprocessed_result = postprocess_utils.sentence_postprocess(token) - text_postprocessed = "" - time_stamp_postprocessed = "" - text_postprocessed_punc = postprocessed_result - if len(postprocessed_result) == 3: - text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \ - postprocessed_result[1], \ - postprocessed_result[2] - else: - text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] - - text_postprocessed_punc = text_postprocessed - punc_id_list = [] - if len(word_lists) > 0 and text2punc is not None: - text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20) - - item = {'key': 
key, 'value': text_postprocessed_punc} - if text_postprocessed != "": - item['text_postprocessed'] = text_postprocessed - if time_stamp_postprocessed != "": - item['time_stamp'] = time_stamp_postprocessed - - item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed) - - asr_result_list.append(item) - finish_count += 1 - # asr_utils.print_progress(finish_count / file_count) - if writer is not None: - # Write the result to each file - ibest_writer["token"][key] = " ".join(token) - ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["vad"][key] = "{}".format(vadsegments) - ibest_writer["text"][key] = " ".join(word_lists) - ibest_writer["text_with_punc"][key] = text_postprocessed_punc - if time_stamp_postprocessed is not None: - ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) - - logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc)) - return asr_result_list - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="ASR Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - parser.add_argument( - "--hotword", - type=str_or_none, - default=None, - help="hotword file path or hotwords seperated by space" - ) - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--word_lm_train_config", - type=str, - help="Word LM training configuration", - ) - group.add_argument( - "--word_lm_file", - type=str, - help="Word LM parameter file", - ) - group.add_argument( - "--ngram_file", - type=str, - help="N-gram parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. 
If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=20, help="Beam size") - group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty") - group.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain max output length. " - "If maxlenratio=0.0 (default), it uses a end-detect " - "function " - "to automatically find maximum hypothesis lengths." - "If maxlenratio<0.0, its absolute value is interpreted" - "as a constant max output length", - ) - group.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - group.add_argument( - "--ctc_weight", - type=float, - default=0.5, - help="CTC weight in joint decoding", - ) - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") - group.add_argument("--streaming", type=str2bool, default=False) - - group.add_argument( - "--frontend_conf", - default=None, - help="", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. " - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. 
" - "If not given, refers from the training args", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - param_dict = {'hotword': args.hotword} - kwargs = vars(args) - kwargs.pop("config", None) - kwargs['param_dict'] = param_dict - inference_pipeline = inference_modelscope(**kwargs) - return inference_pipeline(kwargs["data_path_and_name_and_type"], param_dict=param_dict) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py deleted file mode 100644 index 4f04d02e3..000000000 --- a/funasr/bin/asr_inference_paraformer_streaming.py +++ /dev/null @@ -1,749 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import sys -import time -import copy -import os -import codecs -import tempfile -import requests -import yaml -from pathlib import Path -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict -from typing import Any -from typing import List - -import numpy as np -import torch -import torchaudio -from typeguard import check_argument_types - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch -from funasr.modules.beam_search.beam_search import Hypothesis -from funasr.modules.scorers.ctc import CTCPrefixScorer -from funasr.modules.scorers.length_bonus import LengthBonus -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.asr import ASRTaskParaformer as ASRTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline -from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export - -np.set_printoptions(threshold=np.inf) - - -class Speech2Text: - """Speech2Text class - - Examples: - >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pth") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2text(audio) - [(text, token, token_int, hypothesis object), ...] - - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - dtype: str = "float32", - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - frontend_conf: dict = None, - hotword_list_or_file: str = None, - **kwargs, - ): - assert check_argument_types() - - # 1. 
Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - frontend = None - if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: - frontend = WavFrontendOnline(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) - - logging.info("asr_model: {}".format(asr_model)) - logging.info("asr_train_args: {}".format(asr_train_args)) - asr_model.to(dtype=getattr(torch, dtype)).eval() - - if asr_model.ctc != None: - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - scorers.update( - ctc=ctc - ) - token_list = asr_model.token_list - scorers.update( - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device - ) - scorers["lm"] = lm.lm - - # 3. Build ngram model - # ngram is not supported now - ngram = None - scorers["ngram"] = ngram - - # 4. Build BeamSearch object - # transducer is not supported now - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else "full", - ) - - beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() - for scorer in scorers.values(): - if isinstance(scorer, torch.nn.Module): - scorer.to(device=device, dtype=getattr(torch, dtype)).eval() - - logging.info(f"Decoding device={device}, dtype={dtype}") - - # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - - # 6. 
[Optional] Build hotword list from str, local file or url - - is_use_lm = lm_weight != 0.0 and lm_file is not None - if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm: - beam_search = None - self.beam_search = beam_search - logging.info(f"Beam_search: {self.beam_search}") - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - self.frontend = frontend - self.encoder_downsampling_factor = 1 - if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d": - self.encoder_downsampling_factor = 4 - - @torch.no_grad() - def __call__( - self, cache: dict, speech: Union[torch.Tensor], speech_lengths: Union[torch.Tensor] = None - ): - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - results = [] - cache_en = cache["encoder"] - if speech.shape[1] < 16 * 60 and cache_en["is_final"]: - if cache_en["start_idx"] == 0: - return [] - cache_en["tail_chunk"] = True - feats = cache_en["feats"] - feats_len = torch.tensor([feats.shape[1]]) - self.asr_model.frontend = None - results = self.infer(feats, feats_len, cache) - return results - else: - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths, cache_en["is_final"]) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.asr_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - - if feats.shape[1] != 0: - if cache_en["is_final"]: - if feats.shape[1] + cache_en["chunk_size"][2] < cache_en["chunk_size"][1]: - cache_en["last_chunk"] = True - else: - # first chunk - feats_chunk1 = feats[:, :cache_en["chunk_size"][1], :] - feats_len = torch.tensor([feats_chunk1.shape[1]]) - results_chunk1 = self.infer(feats_chunk1, feats_len, cache) - - # last chunk - cache_en["last_chunk"] = True - feats_chunk2 = feats[:, -(feats.shape[1] + cache_en["chunk_size"][2] - cache_en["chunk_size"][1]):, :] - feats_len = torch.tensor([feats_chunk2.shape[1]]) - results_chunk2 = self.infer(feats_chunk2, feats_len, cache) - - return [" ".join(results_chunk1 + results_chunk2)] - - results = self.infer(feats, feats_len, cache) - - return results - - @torch.no_grad() - def infer(self, feats: Union[torch.Tensor], feats_len: Union[torch.Tensor], cache: List = None): - batch = {"speech": feats, "speech_lengths": feats_len} - batch = to_device(batch, device=self.device) - # b. 
Forward Encoder - enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache=cache) - if isinstance(enc, tuple): - enc = enc[0] - # assert len(enc) == 1, len(enc) - enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor - - predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache) - pre_acoustic_embeds, pre_token_length= predictor_outs[0], predictor_outs[1] - if torch.max(pre_token_length) < 1: - return [] - decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache) - decoder_out = decoder_outs - - results = [] - b, n, d = decoder_out.size() - for i in range(b): - x = enc[i, :enc_len[i], :] - am_scores = decoder_out[i, :pre_token_length[i], :] - if self.beam_search is not None: - nbest_hyps = self.beam_search( - x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio - ) - - nbest_hyps = nbest_hyps[: self.nbest] - else: - yseq = am_scores.argmax(dim=-1) - score = am_scores.max(dim=-1)[0] - score = torch.sum(score, dim=-1) - # pad with mask tokens to ensure compatibility with sos/eos tokens - yseq = torch.tensor( - [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device - ) - nbest_hyps = [Hypothesis(yseq=yseq, score=score)] - - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0 and x != 2, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - token = " ".join(token) - - results.append(token) - - # assert check_return_type(results) - return results - - -def inference( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - - **kwargs, -): - inference_pipeline = inference_modelscope( - maxlenratio=maxlenratio, - minlenratio=minlenratio, - batch_size=batch_size, - beam_size=beam_size, - ngpu=ngpu, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - penalty=penalty, - log_level=log_level, - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - raw_inputs=raw_inputs, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - key_file=key_file, - word_lm_train_config=word_lm_train_config, - bpemodel=bpemodel, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - output_dir=output_dir, - dtype=dtype, - seed=seed, - ngram_weight=ngram_weight, - nbest=nbest, - num_workers=num_workers, - - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - 
maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - output_dir: Optional[str] = None, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - export_mode = False - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - batch_size = 1 - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - ) - - speech2text = Speech2Text(**speech2text_kwargs) - - def _load_bytes(input): - middle_data = np.frombuffer(input, dtype=np.int16) - middle_data = np.asarray(middle_data) - if middle_data.dtype.kind not in 'iu': - raise TypeError("'middle_data' must be an array of integers") - dtype = np.dtype('float32') - if dtype.kind != 'f': - raise TypeError("'dtype' must be a floating point type") - - i = np.iinfo(middle_data.dtype) - abs_max = 2 ** (i.bits - 1) - offset = i.min + abs_max - array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32) - return array - - def _read_yaml(yaml_path: Union[str, Path]) -> Dict: - if not Path(yaml_path).exists(): - raise FileExistsError(f'The {yaml_path} does not exist.') - - with open(str(yaml_path), 'rb') as f: - data = yaml.load(f, Loader=yaml.Loader) - return data - - def _prepare_cache(cache: dict = {}, chunk_size=[5,10,5], batch_size=1): - if len(cache) > 0: - return cache - config = _read_yaml(asr_train_config) - enc_output_size = config["encoder_conf"]["output_size"] - feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"] - cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)), - "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False, - "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False} - cache["encoder"] = cache_en - - cache_de = {"decode_fsmn": None} - cache["decoder"] = cache_de - - return cache - - def _cache_reset(cache: dict = {}, chunk_size=[5,10,5], batch_size=1): - if len(cache) > 0: - config = _read_yaml(asr_train_config) - enc_output_size = config["encoder_conf"]["output_size"] - feats_dims = config["frontend_conf"]["n_mels"] * 
config["frontend_conf"]["lfr_m"] - cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)), - "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False, - "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False} - cache["encoder"] = cache_en - - cache_de = {"decode_fsmn": None} - cache["decoder"] = cache_de - - return cache - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - - # 3. Build data-iterator - if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes": - raw_inputs = _load_bytes(data_path_and_name_and_type[0]) - raw_inputs = torch.tensor(raw_inputs) - if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound": - raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0] - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, np.ndarray): - raw_inputs = torch.tensor(raw_inputs) - is_final = False - cache = {} - chunk_size = [5, 10, 5] - if param_dict is not None and "cache" in param_dict: - cache = param_dict["cache"] - if param_dict is not None and "is_final" in param_dict: - is_final = param_dict["is_final"] - if param_dict is not None and "chunk_size" in param_dict: - chunk_size = param_dict["chunk_size"] - - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - raw_inputs = torch.unsqueeze(raw_inputs, axis=0) - asr_result_list = [] - cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1) - item = {} - if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound": - sample_offset = 0 - speech_length = raw_inputs.shape[1] - stride_size = chunk_size[1] * 960 - cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1) - final_result = "" - for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)): - if sample_offset + stride_size >= speech_length - 1: - stride_size = speech_length - sample_offset - cache["encoder"]["is_final"] = True - else: - cache["encoder"]["is_final"] = False - input_lens = torch.tensor([stride_size]) - asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size], input_lens) - if len(asr_result) != 0: - final_result += " ".join(asr_result) + " " - item = {'key': "utt", 'value': final_result.strip()} - else: - input_lens = torch.tensor([raw_inputs.shape[1]]) - cache["encoder"]["is_final"] = is_final - asr_result = speech2text(cache, raw_inputs, input_lens) - item = {'key': "utt", 'value': " ".join(asr_result)} - - asr_result_list.append(item) - if is_final: - cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1) - return asr_result_list - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="ASR Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. 
- parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - parser.add_argument( - "--hotword", - type=str_or_none, - default=None, - help="hotword file path or hotwords seperated by space" - ) - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--word_lm_train_config", - type=str, - help="Word LM training configuration", - ) - group.add_argument( - "--word_lm_file", - type=str, - help="Word LM parameter file", - ) - group.add_argument( - "--ngram_file", - type=str, - help="N-gram parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=20, help="Beam size") - group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty") - group.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain max output length. " - "If maxlenratio=0.0 (default), it uses a end-detect " - "function " - "to automatically find maximum hypothesis lengths." 
- "If maxlenratio<0.0, its absolute value is interpreted" - "as a constant max output length", - ) - group.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - group.add_argument( - "--ctc_weight", - type=float, - default=0.5, - help="CTC weight in joint decoding", - ) - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") - group.add_argument("--streaming", type=str2bool, default=False) - - group.add_argument( - "--frontend_conf", - default=None, - help="", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. " - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. " - "If not given, refers from the training args", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - param_dict = {'hotword': args.hotword} - kwargs = vars(args) - kwargs.pop("config", None) - kwargs['param_dict'] = param_dict - inference(**kwargs) - - -if __name__ == "__main__": - main() - diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py deleted file mode 100644 index bd36907f7..000000000 --- a/funasr/bin/asr_inference_rnnt.py +++ /dev/null @@ -1,734 +0,0 @@ -#!/usr/bin/env python3 - -""" Inference class definition for Transducer models.""" - -from __future__ import annotations - -import argparse -import logging -import math -import sys -from pathlib import Path -from typing import Any, Dict, List, Optional, Sequence, Tuple, Union - -import numpy as np -import torch -from packaging.version import parse as V -from typeguard import check_argument_types, check_return_type - -from funasr.modules.beam_search.beam_search_transducer import ( - BeamSearchTransducer, - Hypothesis, -) -from funasr.modules.nets_utils import TooShortUttError -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.tasks.asr import ASRTransducerTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.types import str2bool, str2triple_str, str_or_none -from funasr.utils.cli_utils import get_commandline_args -from funasr.models.frontend.wav_frontend import WavFrontend - -class Speech2Text: - """Speech2Text class for Transducer models. - Args: - asr_train_config: ASR model training config path. - asr_model_file: ASR model path. - beam_search_config: Beam search config path. - lm_train_config: Language Model training config path. - lm_file: Language Model config path. - token_type: Type of token units. - bpemodel: BPE model path. - device: Device to use for inference. - beam_size: Size of beam during search. - dtype: Data type. - lm_weight: Language model weight. 
- quantize_asr_model: Whether to apply dynamic quantization to ASR model. - quantize_modules: List of module names to apply dynamic quantization on. - quantize_dtype: Dynamic quantization data type. - nbest: Number of final hypothesis. - streaming: Whether to perform chunk-by-chunk inference. - chunk_size: Number of frames in chunk AFTER subsampling. - left_context: Number of frames in left context AFTER subsampling. - right_context: Number of frames in right context AFTER subsampling. - display_partial_hypotheses: Whether to display partial hypotheses. - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - beam_search_config: Dict[str, Any] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - beam_size: int = 5, - dtype: str = "float32", - lm_weight: float = 1.0, - quantize_asr_model: bool = False, - quantize_modules: List[str] = None, - quantize_dtype: str = "qint8", - nbest: int = 1, - streaming: bool = False, - simu_streaming: bool = False, - chunk_size: int = 16, - left_context: int = 32, - right_context: int = 0, - display_partial_hypotheses: bool = False, - ) -> None: - """Construct a Speech2Text object.""" - super().__init__() - - assert check_argument_types() - asr_model, asr_train_args = ASRTransducerTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - - frontend = None - if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: - frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) - - if quantize_asr_model: - if quantize_modules is not None: - if not all([q in ["LSTM", "Linear"] for q in quantize_modules]): - raise ValueError( - "Only 'Linear' and 'LSTM' modules are currently supported" - " by PyTorch and in --quantize_modules" - ) - - q_config = set([getattr(torch.nn, q) for q in quantize_modules]) - else: - q_config = {torch.nn.Linear} - - if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")): - raise ValueError( - "float16 dtype for dynamic quantization is not supported with torch" - " version < 1.5.0. Switching to qint8 dtype instead." - ) - q_dtype = getattr(torch, quantize_dtype) - - asr_model = torch.quantization.quantize_dynamic( - asr_model, q_config, dtype=q_dtype - ).eval() - else: - asr_model.to(dtype=getattr(torch, dtype)).eval() - - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device - ) - lm_scorer = lm.lm - else: - lm_scorer = None - - # 4. 
Build BeamSearch object - if beam_search_config is None: - beam_search_config = {} - - beam_search = BeamSearchTransducer( - asr_model.decoder, - asr_model.joint_network, - beam_size, - lm=lm_scorer, - lm_weight=lm_weight, - nbest=nbest, - **beam_search_config, - ) - - token_list = asr_model.token_list - - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.device = device - self.dtype = dtype - self.nbest = nbest - - self.converter = converter - self.tokenizer = tokenizer - - self.beam_search = beam_search - self.streaming = streaming - self.simu_streaming = simu_streaming - self.chunk_size = max(chunk_size, 0) - self.left_context = left_context - self.right_context = max(right_context, 0) - - if not streaming or chunk_size == 0: - self.streaming = False - self.asr_model.encoder.dynamic_chunk_training = False - - if not simu_streaming or chunk_size == 0: - self.simu_streaming = False - self.asr_model.encoder.dynamic_chunk_training = False - - self.frontend = frontend - self.window_size = self.chunk_size + self.right_context - - if self.streaming: - self._ctx = self.asr_model.encoder.get_encoder_input_size( - self.window_size - ) - - self.last_chunk_length = ( - self.asr_model.encoder.embed.min_frame_length + self.right_context + 1 - ) - self.reset_inference_cache() - - def reset_inference_cache(self) -> None: - """Reset Speech2Text parameters.""" - self.frontend_cache = None - - self.asr_model.encoder.reset_streaming_cache( - self.left_context, device=self.device - ) - self.beam_search.reset_inference_cache() - - self.num_processed_frames = torch.tensor([[0]], device=self.device) - - @torch.no_grad() - def streaming_decode( - self, - speech: Union[torch.Tensor, np.ndarray], - is_final: bool = True, - ) -> List[Hypothesis]: - """Speech2Text streaming call. - Args: - speech: Chunk of speech data. (S) - is_final: Whether speech corresponds to the final chunk of data. - Returns: - nbest_hypothesis: N-best hypothesis. 
- """ - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - if is_final: - if self.streaming and speech.size(0) < self.last_chunk_length: - pad = torch.zeros( - self.last_chunk_length - speech.size(0), speech.size(1), dtype=speech.dtype - ) - speech = torch.cat([speech, pad], dim=0) #feats, feats_length = self.apply_frontend(speech, is_final=is_final) - - feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) - - if self.asr_model.normalize is not None: - feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths) - - feats = to_device(feats, device=self.device) - feats_lengths = to_device(feats_lengths, device=self.device) - enc_out = self.asr_model.encoder.chunk_forward( - feats, - feats_lengths, - self.num_processed_frames, - chunk_size=self.chunk_size, - left_context=self.left_context, - right_context=self.right_context, - ) - nbest_hyps = self.beam_search(enc_out[0], is_final=is_final) - - self.num_processed_frames += self.chunk_size - - if is_final: - self.reset_inference_cache() - - return nbest_hyps - - @torch.no_grad() - def simu_streaming_decode(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]: - """Speech2Text call. - Args: - speech: Speech data. (S) - Returns: - nbest_hypothesis: N-best hypothesis. - """ - assert check_argument_types() - - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) - - if self.asr_model.normalize is not None: - feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths) - - feats = to_device(feats, device=self.device) - feats_lengths = to_device(feats_lengths, device=self.device) - enc_out = self.asr_model.encoder.simu_chunk_forward(feats, feats_lengths, self.chunk_size, self.left_context, self.right_context) - nbest_hyps = self.beam_search(enc_out[0]) - - return nbest_hyps - - @torch.no_grad() - def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]: - """Speech2Text call. - Args: - speech: Speech data. (S) - Returns: - nbest_hypothesis: N-best hypothesis. - """ - assert check_argument_types() - - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - feats = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1)) - - feats = to_device(feats, device=self.device) - feats_lengths = to_device(feats_lengths, device=self.device) - - enc_out, _ = self.asr_model.encoder(feats, feats_lengths) - - nbest_hyps = self.beam_search(enc_out[0]) - - return nbest_hyps - - def hypotheses_to_results(self, nbest_hyps: List[Hypothesis]) -> List[Any]: - """Build partial or final results from the hypotheses. - Args: - nbest_hyps: N-best hypothesis. - Returns: - results: Results containing different representation for the hypothesis. - """ - results = [] - - for hyp in nbest_hyps: - token_int = list(filter(lambda x: x != 0, hyp.yseq)) - - token = self.converter.ids2tokens(token_int) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - results.append((text, token, token_int, hyp)) - - assert check_return_type(results) - - return results - - @staticmethod - def from_pretrained( - model_tag: Optional[str] = None, - **kwargs: Optional[Any], - ) -> Speech2Text: - """Build Speech2Text instance from the pretrained model. 
- Args: - model_tag: Model tag of the pretrained models. - Return: - : Speech2Text instance. - """ - if model_tag is not None: - try: - from espnet_model_zoo.downloader import ModelDownloader - - except ImportError: - logging.error( - "`espnet_model_zoo` is not installed. " - "Please install via `pip install -U espnet_model_zoo`." - ) - raise - d = ModelDownloader() - kwargs.update(**d.download_and_unpack(model_tag)) - - return Speech2Text(**kwargs) - - -def inference( - output_dir: str, - batch_size: int, - dtype: str, - beam_size: int, - ngpu: int, - seed: int, - lm_weight: float, - nbest: int, - num_workers: int, - log_level: Union[int, str], - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str], - beam_search_config: Optional[dict], - lm_train_config: Optional[str], - lm_file: Optional[str], - model_tag: Optional[str], - token_type: Optional[str], - bpemodel: Optional[str], - key_file: Optional[str], - allow_variable_data_keys: bool, - quantize_asr_model: Optional[bool], - quantize_modules: Optional[List[str]], - quantize_dtype: Optional[str], - streaming: Optional[bool], - simu_streaming: Optional[bool], - chunk_size: Optional[int], - left_context: Optional[int], - right_context: Optional[int], - display_partial_hypotheses: bool, - **kwargs, -) -> None: - """Transducer model inference. - Args: - output_dir: Output directory path. - batch_size: Batch decoding size. - dtype: Data type. - beam_size: Beam size. - ngpu: Number of GPUs. - seed: Random number generator seed. - lm_weight: Weight of language model. - nbest: Number of final hypothesis. - num_workers: Number of workers. - log_level: Level of verbose for logs. - data_path_and_name_and_type: - asr_train_config: ASR model training config path. - asr_model_file: ASR model path. - beam_search_config: Beam search config path. - lm_train_config: Language Model training config path. - lm_file: Language Model path. - model_tag: Model tag. - token_type: Type of token units. - bpemodel: BPE model path. - key_file: File key. - allow_variable_data_keys: Whether to allow variable data keys. - quantize_asr_model: Whether to apply dynamic quantization to ASR model. - quantize_modules: List of module names to apply dynamic quantization on. - quantize_dtype: Dynamic quantization data type. - streaming: Whether to perform chunk-by-chunk inference. - chunk_size: Number of frames in chunk AFTER subsampling. - left_context: Number of frames in left context AFTER subsampling. - right_context: Number of frames in right context AFTER subsampling. - display_partial_hypotheses: Whether to display partial hypotheses. - """ - assert check_argument_types() - - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1: - device = "cuda" - else: - device = "cpu" - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. 
Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - beam_search_config=beam_search_config, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - dtype=dtype, - beam_size=beam_size, - lm_weight=lm_weight, - nbest=nbest, - quantize_asr_model=quantize_asr_model, - quantize_modules=quantize_modules, - quantize_dtype=quantize_dtype, - streaming=streaming, - simu_streaming=simu_streaming, - chunk_size=chunk_size, - left_context=left_context, - right_context=right_context, - ) - speech2text = Speech2Text.from_pretrained( - model_tag=model_tag, - **speech2text_kwargs, - ) - - # 3. Build data-iterator - loader = ASRTransducerTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTransducerTask.build_preprocess_fn( - speech2text.asr_train_args, False - ), - collate_fn=ASRTransducerTask.build_collate_fn( - speech2text.asr_train_args, False - ), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - # 4 .Start for-loop - with DatadirWriter(output_dir) as writer: - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - assert len(batch.keys()) == 1 - - try: - if speech2text.streaming: - speech = batch["speech"] - - _steps = len(speech) // speech2text._ctx - _end = 0 - for i in range(_steps): - _end = (i + 1) * speech2text._ctx - - speech2text.streaming_decode( - speech[i * speech2text._ctx : _end], is_final=False - ) - - final_hyps = speech2text.streaming_decode( - speech[_end : len(speech)], is_final=True - ) - elif speech2text.simu_streaming: - final_hyps = speech2text.simu_streaming_decode(**batch) - else: - final_hyps = speech2text(**batch) - - results = speech2text.hypotheses_to_results(final_hyps) - except TooShortUttError as e: - logging.warning(f"Utterance {keys} {e}") - hyp = Hypothesis(score=0.0, yseq=[], dec_state=None) - results = [[" ", [""], [2], hyp]] * nbest - - key = keys[0] - for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): - ibest_writer = writer[f"{n}best_recog"] - - ibest_writer["token"][key] = " ".join(token) - ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["score"][key] = str(hyp.score) - - if text is not None: - ibest_writer["text"][key] = text - - -def get_parser(): - """Get Transducer model inference parser.""" - - parser = config_argparse.ArgumentParser( - description="ASR Transducer Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=True, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=5, help="Beam size") - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument( - "--beam_search_config", - default={}, - help="The keyword arguments for transducer beam search.", - ) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. " - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. " - "If not given, refers from the training args", - ) - - group = parser.add_argument_group("Dynamic quantization related") - parser.add_argument( - "--quantize_asr_model", - type=bool, - default=False, - help="Apply dynamic quantization to ASR model.", - ) - parser.add_argument( - "--quantize_modules", - nargs="*", - default=None, - help="""Module names to apply dynamic quantization on. - The module names are provided as a list, where each name is separated - by a comma (e.g.: --quantize-config=[Linear,LSTM,GRU]). 
- Each specified name should be an attribute of 'torch.nn', e.g.: - torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""", - ) - parser.add_argument( - "--quantize_dtype", - type=str, - default="qint8", - choices=["float16", "qint8"], - help="Dtype for dynamic quantization.", - ) - - group = parser.add_argument_group("Streaming related") - parser.add_argument( - "--streaming", - type=bool, - default=False, - help="Whether to perform chunk-by-chunk inference.", - ) - parser.add_argument( - "--simu_streaming", - type=bool, - default=False, - help="Whether to simulate chunk-by-chunk inference.", - ) - parser.add_argument( - "--chunk_size", - type=int, - default=16, - help="Number of frames in chunk AFTER subsampling.", - ) - parser.add_argument( - "--left_context", - type=int, - default=32, - help="Number of frames in left context of the chunk AFTER subsampling.", - ) - parser.add_argument( - "--right_context", - type=int, - default=0, - help="Number of frames in right context of the chunk AFTER subsampling.", - ) - parser.add_argument( - "--display_partial_hypotheses", - type=bool, - default=False, - help="Whether to display partial hypotheses during chunk-by-chunk inference.", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() - diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py deleted file mode 100644 index 35ecdc24b..000000000 --- a/funasr/bin/asr_inference_uniasr.py +++ /dev/null @@ -1,694 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import sys -from pathlib import Path -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict -from typing import Any - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.beam_search.beam_search import BeamSearchScama as BeamSearch -from funasr.modules.beam_search.beam_search import Hypothesis -from funasr.modules.scorers.ctc import CTCPrefixScorer -from funasr.modules.scorers.length_bonus import LengthBonus -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.asr import ASRTaskUniASR as ASRTask -from funasr.tasks.lm import LMTask -from funasr.text.build_tokenizer import build_tokenizer -from funasr.text.token_id_converter import TokenIDConverter -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend - - - -class Speech2Text: - """Speech2Text class - - Examples: - >>> import soundfile - >>> speech2text = Speech2Text("asr_config.yml", "asr.pb") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2text(audio) - [(text, token, token_int, hypothesis object), ...] 
- - """ - - def __init__( - self, - asr_train_config: Union[Path, str] = None, - asr_model_file: Union[Path, str] = None, - cmvn_file: Union[Path, str] = None, - lm_train_config: Union[Path, str] = None, - lm_file: Union[Path, str] = None, - token_type: str = None, - bpemodel: str = None, - device: str = "cpu", - maxlenratio: float = 0.0, - minlenratio: float = 0.0, - dtype: str = "float32", - beam_size: int = 20, - ctc_weight: float = 0.5, - lm_weight: float = 1.0, - ngram_weight: float = 0.9, - penalty: float = 0.0, - nbest: int = 1, - token_num_relax: int = 1, - decoding_ind: int = 0, - decoding_mode: str = "model1", - frontend_conf: dict = None, - **kwargs, - ): - assert check_argument_types() - - # 1. Build ASR model - scorers = {} - asr_model, asr_train_args = ASRTask.build_model_from_file( - asr_train_config, asr_model_file, cmvn_file, device - ) - frontend = None - if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: - frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) - - logging.info("asr_train_args: {}".format(asr_train_args)) - asr_model.to(dtype=getattr(torch, dtype)).eval() - if decoding_mode == "model1": - decoder = asr_model.decoder - else: - decoder = asr_model.decoder2 - - if asr_model.ctc != None: - ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos) - scorers.update( - ctc=ctc - ) - token_list = asr_model.token_list - scorers.update( - decoder=decoder, - length_bonus=LengthBonus(len(token_list)), - ) - - # 2. Build Language model - if lm_train_config is not None: - lm, lm_train_args = LMTask.build_model_from_file( - lm_train_config, lm_file, device - ) - scorers["lm"] = lm.lm - - # 3. Build ngram model - # ngram is not supported now - ngram = None - scorers["ngram"] = ngram - - # 4. Build BeamSearch object - # transducer is not supported now - beam_search_transducer = None - - weights = dict( - decoder=1.0 - ctc_weight, - ctc=ctc_weight, - lm=lm_weight, - ngram=ngram_weight, - length_bonus=penalty, - ) - beam_search = BeamSearch( - beam_size=beam_size, - weights=weights, - scorers=scorers, - sos=asr_model.sos, - eos=asr_model.eos, - vocab_size=len(token_list), - token_list=token_list, - pre_beam_score_key=None if ctc_weight == 1.0 else "full", - ) - - beam_search.to(device=device, dtype=getattr(torch, dtype)).eval() - for scorer in scorers.values(): - if isinstance(scorer, torch.nn.Module): - scorer.to(device=device, dtype=getattr(torch, dtype)).eval() - # logging.info(f"Beam_search: {beam_search}") - logging.info(f"Decoding device={device}, dtype={dtype}") - - # 5. [Optional] Build Text converter: e.g. 
bpe-sym -> Text - if token_type is None: - token_type = asr_train_args.token_type - if bpemodel is None: - bpemodel = asr_train_args.bpemodel - - if token_type is None: - tokenizer = None - elif token_type == "bpe": - if bpemodel is not None: - tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) - else: - tokenizer = None - else: - tokenizer = build_tokenizer(token_type=token_type) - converter = TokenIDConverter(token_list=token_list) - logging.info(f"Text tokenizer: {tokenizer}") - - self.asr_model = asr_model - self.asr_train_args = asr_train_args - self.converter = converter - self.tokenizer = tokenizer - self.beam_search = beam_search - self.beam_search_transducer = beam_search_transducer - self.maxlenratio = maxlenratio - self.minlenratio = minlenratio - self.device = device - self.dtype = dtype - self.nbest = nbest - self.token_num_relax = token_num_relax - self.decoding_ind = decoding_ind - self.decoding_mode = decoding_mode - self.frontend = frontend - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None - ) -> List[ - Tuple[ - Optional[str], - List[str], - List[int], - Union[Hypothesis], - ] - ]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.asr_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - lfr_factor = max(1, (feats.size()[-1] // 80) - 1) - feats_raw = feats.clone().to(self.device) - batch = {"speech": feats, "speech_lengths": feats_len} - - # a. To device - batch = to_device(batch, device=self.device) - # b. Forward Encoder - _, enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind) - if isinstance(enc, tuple): - enc = enc[0] - assert len(enc) == 1, len(enc) - if self.decoding_mode == "model1": - predictor_outs = self.asr_model.calc_predictor_mask(enc, enc_len) - else: - enc, enc_len = self.asr_model.encode2(enc, enc_len, feats_raw, feats_len, ind=self.decoding_ind) - predictor_outs = self.asr_model.calc_predictor_mask2(enc, enc_len) - - scama_mask = predictor_outs[4] - pre_token_length = predictor_outs[1] - pre_acoustic_embeds = predictor_outs[0] - maxlen = pre_token_length.sum().item() + self.token_num_relax - minlen = max(0, pre_token_length.sum().item() - self.token_num_relax) - # c. 
Passed the encoder result and the beam search - nbest_hyps = self.beam_search( - x=enc[0], scama_mask=scama_mask, pre_acoustic_embeds=pre_acoustic_embeds, maxlenratio=self.maxlenratio, - minlenratio=self.minlenratio, maxlen=int(maxlen), minlen=int(minlen), - ) - - nbest_hyps = nbest_hyps[: self.nbest] - - results = [] - for hyp in nbest_hyps: - assert isinstance(hyp, (Hypothesis)), type(hyp) - - # remove sos/eos and get results - last_pos = -1 - if isinstance(hyp.yseq, list): - token_int = hyp.yseq[1:last_pos] - else: - token_int = hyp.yseq[1:last_pos].tolist() - - # remove blank symbol id, which is assumed to be 0 - token_int = list(filter(lambda x: x != 0, token_int)) - - # Change integer-ids to tokens - token = self.converter.ids2tokens(token_int) - token = list(filter(lambda x: x != "", token)) - - if self.tokenizer is not None: - text = self.tokenizer.tokens2text(token) - else: - text = None - results.append((text, token, token_int, hyp)) - - assert check_return_type(results) - return results - - -def inference( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - ngram_file: Optional[str] = None, - cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - token_num_relax: int = 1, - decoding_ind: int = 0, - decoding_mode: str = "model1", - **kwargs, -): - inference_pipeline = inference_modelscope( - maxlenratio=maxlenratio, - minlenratio=minlenratio, - batch_size=batch_size, - beam_size=beam_size, - ngpu=ngpu, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - penalty=penalty, - log_level=log_level, - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - raw_inputs=raw_inputs, - lm_train_config=lm_train_config, - lm_file=lm_file, - token_type=token_type, - key_file=key_file, - word_lm_train_config=word_lm_train_config, - bpemodel=bpemodel, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - output_dir=output_dir, - dtype=dtype, - seed=seed, - ngram_weight=ngram_weight, - ngram_file=ngram_file, - nbest=nbest, - num_workers=num_workers, - token_num_relax=token_num_relax, - decoding_ind=decoding_ind, - decoding_mode=decoding_mode, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - ngram_file: Optional[str] = None, - cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = 
None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - token_num_relax: int = 1, - decoding_ind: int = 0, - decoding_mode: str = "model1", - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if word_lm_train_config is not None: - raise NotImplementedError("Word LM is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - if param_dict is not None and "decoding_model" in param_dict: - if param_dict["decoding_model"] == "fast": - decoding_ind = 0 - decoding_mode = "model1" - elif param_dict["decoding_model"] == "normal": - decoding_ind = 0 - decoding_mode = "model2" - elif param_dict["decoding_model"] == "offline": - decoding_ind = 1 - decoding_mode = "model2" - else: - raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"])) - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2text - speech2text_kwargs = dict( - asr_train_config=asr_train_config, - asr_model_file=asr_model_file, - cmvn_file=cmvn_file, - lm_train_config=lm_train_config, - lm_file=lm_file, - ngram_file=ngram_file, - token_type=token_type, - bpemodel=bpemodel, - device=device, - maxlenratio=maxlenratio, - minlenratio=minlenratio, - dtype=dtype, - beam_size=beam_size, - ctc_weight=ctc_weight, - lm_weight=lm_weight, - ngram_weight=ngram_weight, - penalty=penalty, - nbest=nbest, - streaming=streaming, - token_num_relax=token_num_relax, - decoding_ind=decoding_ind, - decoding_mode=decoding_mode, - ) - speech2text = Speech2Text(**speech2text_kwargs) - - def _forward(data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs, - ): - # 3. 
Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - fs=fs, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False), - collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - asr_result_list = [] - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - else: - writer = None - - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - # N-best list of (text, token, token_int, hyp_object) - try: - results = speech2text(**batch) - except TooShortUttError as e: - logging.warning(f"Utterance {keys} {e}") - hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) - results = [[" ", ["sil"], [2], hyp]] * nbest - - # Only supporting batch_size==1 - key = keys[0] - logging.info(f"Utterance: {key}") - for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results): - # Create a directory: outdir/{n}best_recog - if writer is not None: - ibest_writer = writer[f"{n}best_recog"] - - # Write the result to each file - ibest_writer["token"][key] = " ".join(token) - # ibest_writer["token_int"][key] = " ".join(map(str, token_int)) - ibest_writer["score"][key] = str(hyp.score) - - if text is not None: - text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token) - item = {'key': key, 'value': text_postprocessed} - asr_result_list.append(item) - finish_count += 1 - asr_utils.print_progress(finish_count / file_count) - if writer is not None: - ibest_writer["text"][key] = " ".join(word_lists) - return asr_result_list - - return _forward - - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="ASR Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=True) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--asr_train_config", - type=str, - help="ASR training configuration", - ) - group.add_argument( - "--asr_model_file", - type=str, - help="ASR model parameter file", - ) - group.add_argument( - "--cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--lm_train_config", - type=str, - help="LM training configuration", - ) - group.add_argument( - "--lm_file", - type=str, - help="LM parameter file", - ) - group.add_argument( - "--word_lm_train_config", - type=str, - help="Word LM training configuration", - ) - group.add_argument( - "--word_lm_file", - type=str, - help="Word LM parameter file", - ) - group.add_argument( - "--ngram_file", - type=str, - help="N-gram parameter file", - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - - group = parser.add_argument_group("Beam-search related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses") - group.add_argument("--beam_size", type=int, default=20, help="Beam size") - group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty") - group.add_argument( - "--maxlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain max output length. " - "If maxlenratio=0.0 (default), it uses a end-detect " - "function " - "to automatically find maximum hypothesis lengths." - "If maxlenratio<0.0, its absolute value is interpreted" - "as a constant max output length", - ) - group.add_argument( - "--minlenratio", - type=float, - default=0.0, - help="Input length ratio to obtain min output length", - ) - group.add_argument( - "--ctc_weight", - type=float, - default=0.5, - help="CTC weight in joint decoding", - ) - group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight") - group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight") - group.add_argument("--streaming", type=str2bool, default=False) - - group = parser.add_argument_group("Text converter related") - group.add_argument( - "--token_type", - type=str_or_none, - default=None, - choices=["char", "bpe", None], - help="The token type for ASR model. " - "If not given, refers from the training args", - ) - group.add_argument( - "--bpemodel", - type=str_or_none, - default=None, - help="The model path of sentencepiece. 
" - "If not given, refers from the training args", - ) - group.add_argument("--token_num_relax", type=int, default=1, help="") - group.add_argument("--decoding_ind", type=int, default=0, help="") - group.add_argument("--decoding_mode", type=str, default="model1", help="") - group.add_argument( - "--ctc_weight2", - type=float, - default=0.0, - help="CTC weight in joint decoding", - ) - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/diar_infer.py b/funasr/bin/diar_infer.py new file mode 100755 index 000000000..f698a6650 --- /dev/null +++ b/funasr/bin/diar_infer.py @@ -0,0 +1,346 @@ +#!/usr/bin/env python3 +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. +# MIT License (https://opensource.org/licenses/MIT) + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +from collections import OrderedDict +import numpy as np +import soundfile +import torch +from torch.nn import functional as F +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.utils.cli_utils import get_commandline_args +from funasr.tasks.diar import DiarTask +from funasr.tasks.diar import EENDOLADiarTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from scipy.ndimage import median_filter +from funasr.utils.misc import statistic_model_parameters +from funasr.datasets.iterable_dataset import load_bytes +from funasr.models.frontend.wav_frontend import WavFrontendMel23 + +class Speech2DiarizationEEND: + """Speech2Diarlization class + + Examples: + >>> import soundfile + >>> import numpy as np + >>> speech2diar = Speech2DiarizationEEND("diar_sond_config.yml", "diar_sond.pb") + >>> profile = np.load("profiles.npy") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2diar(audio, profile) + {"spk1": [(int, int), ...], ...} + + """ + + def __init__( + self, + diar_train_config: Union[Path, str] = None, + diar_model_file: Union[Path, str] = None, + device: str = "cpu", + dtype: str = "float32", + ): + assert check_argument_types() + + # 1. 
Build Diarization model + diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file( + config_file=diar_train_config, + model_file=diar_model_file, + device=device + ) + frontend = None + if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None: + frontend = WavFrontendMel23(**diar_train_args.frontend_conf) + + # set up seed for eda + np.random.seed(diar_train_args.seed) + torch.manual_seed(diar_train_args.seed) + torch.cuda.manual_seed(diar_train_args.seed) + os.environ['PYTORCH_SEED'] = str(diar_train_args.seed) + logging.info("diar_model: {}".format(diar_model)) + logging.info("diar_train_args: {}".format(diar_train_args)) + diar_model.to(dtype=getattr(torch, dtype)).eval() + + self.diar_model = diar_model + self.diar_train_args = diar_train_args + self.device = device + self.dtype = dtype + self.frontend = frontend + + @torch.no_grad() + def __call__( + self, + speech: Union[torch.Tensor, np.ndarray], + speech_lengths: Union[torch.Tensor, np.ndarray] = None + ): + """Inference + + Args: + speech: Input speech data + Returns: + diarization results + + """ + assert check_argument_types() + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + + if self.frontend is not None: + feats, feats_len = self.frontend.forward(speech, speech_lengths) + feats = to_device(feats, device=self.device) + feats_len = feats_len.int() + self.diar_model.frontend = None + else: + feats = speech + feats_len = speech_lengths + batch = {"speech": feats, "speech_lengths": feats_len} + batch = to_device(batch, device=self.device) + results = self.diar_model.estimate_sequential(**batch) + + return results + + @staticmethod + def from_pretrained( + model_tag: Optional[str] = None, + **kwargs: Optional[Any], + ): + """Build Speech2Diarization instance from the pretrained model. + + Args: + model_tag (Optional[str]): Model tag of the pretrained models. + Currently, the tags of espnet_model_zoo are supported. + + Returns: + Speech2Diarization: Speech2Diarization instance. + + """ + if model_tag is not None: + try: + from espnet_model_zoo.downloader import ModelDownloader + + except ImportError: + logging.error( + "`espnet_model_zoo` is not installed. " + "Please install via `pip install -U espnet_model_zoo`." + ) + raise + d = ModelDownloader() + kwargs.update(**d.download_and_unpack(model_tag)) + + return Speech2DiarizationEEND(**kwargs) + + +class Speech2DiarizationSOND: + """Speech2Xvector class + + Examples: + >>> import soundfile + >>> import numpy as np + >>> speech2diar = Speech2DiarizationSOND("diar_sond_config.yml", "diar_sond.pb") + >>> profile = np.load("profiles.npy") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2diar(audio, profile) + {"spk1": [(int, int), ...], ...} + + """ + + def __init__( + self, + diar_train_config: Union[Path, str] = None, + diar_model_file: Union[Path, str] = None, + device: Union[str, torch.device] = "cpu", + batch_size: int = 1, + dtype: str = "float32", + streaming: bool = False, + smooth_size: int = 83, + dur_threshold: float = 10, + ): + assert check_argument_types() + + # TODO: 1. 
Build Diarization model + diar_model, diar_train_args = DiarTask.build_model_from_file( + config_file=diar_train_config, + model_file=diar_model_file, + device=device + ) + logging.info("diar_model: {}".format(diar_model)) + logging.info("model parameter number: {}".format(statistic_model_parameters(diar_model))) + logging.info("diar_train_args: {}".format(diar_train_args)) + diar_model.to(dtype=getattr(torch, dtype)).eval() + + self.diar_model = diar_model + self.diar_train_args = diar_train_args + self.token_list = diar_train_args.token_list + self.smooth_size = smooth_size + self.dur_threshold = dur_threshold + self.device = device + self.dtype = dtype + + def smooth_multi_labels(self, multi_label): + multi_label = median_filter(multi_label, (self.smooth_size, 1), mode="constant", cval=0.0).astype(int) + return multi_label + + @staticmethod + def calc_spk_turns(label_arr, spk_list): + turn_list = [] + length = label_arr.shape[0] + n_spk = label_arr.shape[1] + for k in range(n_spk): + if spk_list[k] == "None": + continue + in_utt = False + start = 0 + for i in range(length): + if label_arr[i, k] == 1 and in_utt is False: + start = i + in_utt = True + if label_arr[i, k] == 0 and in_utt is True: + turn_list.append([spk_list[k], start, i - start]) + in_utt = False + if in_utt: + turn_list.append([spk_list[k], start, length - start]) + return turn_list + + @staticmethod + def seq2arr(seq, vec_dim=8): + def int2vec(x, vec_dim=8, dtype=np.int): + b = ('{:0' + str(vec_dim) + 'b}').format(x) + # little-endian order: lower bit first + return (np.array(list(b)[::-1]) == '1').astype(dtype) + + # process oov + seq = np.array([int(x) for x in seq]) + new_seq = [] + for i, x in enumerate(seq): + if x < 2 ** vec_dim: + new_seq.append(x) + else: + idx_list = np.where(seq < 2 ** vec_dim)[0] + idx = np.abs(idx_list - i).argmin() + new_seq.append(seq[idx_list[idx]]) + return np.row_stack([int2vec(x, vec_dim) for x in new_seq]) + + def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"): + logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T + # upsampling outputs to match inputs + ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio + logits_idx = F.upsample( + logits_idx.unsqueeze(1).float(), + size=(ut, ), + mode="nearest", + ).squeeze(1).long() + logits_idx = logits_idx[0].tolist() + pse_labels = [self.token_list[x] for x in logits_idx] + if output_format == "pse_labels": + return pse_labels, None + + multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers + multi_labels = self.smooth_multi_labels(multi_labels) + if output_format == "binary_labels": + return multi_labels, None + + spk_list = ["spk{}".format(i + 1) for i in range(spk_num)] + spk_turns = self.calc_spk_turns(multi_labels, spk_list) + results = OrderedDict() + for spk, st, dur in spk_turns: + if spk not in results: + results[spk] = [] + if dur > self.dur_threshold: + results[spk].append((st, st+dur)) + + # sort segments in start time ascending + for spk in results: + results[spk] = sorted(results[spk], key=lambda x: x[0]) + + return results, pse_labels + + @torch.no_grad() + def __call__( + self, + speech: Union[torch.Tensor, np.ndarray], + profile: Union[torch.Tensor, np.ndarray], + output_format: str = "speaker_turn" + ): + """Inference + + Args: + speech: Input speech data + profile: Speaker profiles + Returns: + diarization results for each speaker + + """ + assert check_argument_types() + # Input as audio signal + if isinstance(speech, 
np.ndarray): + speech = torch.tensor(speech) + if isinstance(profile, np.ndarray): + profile = torch.tensor(profile) + + # data: (Nsamples,) -> (1, Nsamples) + speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) + profile = profile.unsqueeze(0).to(getattr(torch, self.dtype)) + # lengths: (1,) + speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) + profile_lengths = profile.new_full([1], dtype=torch.long, fill_value=profile.size(1)) + batch = {"speech": speech, "speech_lengths": speech_lengths, + "profile": profile, "profile_lengths": profile_lengths} + # a. To device + batch = to_device(batch, device=self.device) + + logits = self.diar_model.prediction_forward(**batch) + results, pse_labels = self.post_processing(logits, profile.shape[1], output_format) + + return results, pse_labels + + @staticmethod + def from_pretrained( + model_tag: Optional[str] = None, + **kwargs: Optional[Any], + ): + """Build Speech2Xvector instance from the pretrained model. + + Args: + model_tag (Optional[str]): Model tag of the pretrained models. + Currently, the tags of espnet_model_zoo are supported. + + Returns: + Speech2Xvector: Speech2Xvector instance. + + """ + if model_tag is not None: + try: + from espnet_model_zoo.downloader import ModelDownloader + + except ImportError: + logging.error( + "`espnet_model_zoo` is not installed. " + "Please install via `pip install -U espnet_model_zoo`." + ) + raise + d = ModelDownloader() + kwargs.update(**d.download_and_unpack(model_tag)) + + return Speech2DiarizationSOND(**kwargs) + + + + diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py index 07974c072..08004e89b 100755 --- a/funasr/bin/diar_inference_launch.py +++ b/funasr/bin/diar_inference_launch.py @@ -15,6 +15,352 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +from collections import OrderedDict +import numpy as np +import soundfile +import torch +from torch.nn import functional as F +from typeguard import check_argument_types +from typeguard import check_return_type +from scipy.signal import medfilt +from funasr.utils.cli_utils import get_commandline_args +from funasr.tasks.diar import DiarTask +from funasr.tasks.asr import ASRTask +from funasr.tasks.diar import EENDOLADiarTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from scipy.ndimage import median_filter +from funasr.utils.misc import statistic_model_parameters +from funasr.datasets.iterable_dataset import load_bytes +from funasr.bin.diar_infer import Speech2DiarizationSOND, Speech2DiarizationEEND + +def inference_sond( + diar_train_config: str, + diar_model_file: str, + output_dir: Optional[str] = None, + batch_size: int = 1, + dtype: str = "float32", + ngpu: int = 0, + seed: int = 0, + num_workers: int = 0, + log_level: Union[int, str] = "INFO", + key_file: Optional[str] = None, + model_tag: Optional[str] = None, + allow_variable_data_keys: bool = True, + streaming: bool = False, + smooth_size: int = 83, 
+ dur_threshold: int = 10, + out_format: str = "vad", + param_dict: Optional[dict] = None, + mode: str = "sond", + **kwargs, +): + assert check_argument_types() + ncpu = kwargs.get("ncpu", 1) + torch.set_num_threads(ncpu) + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.info("param_dict: {}".format(param_dict)) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. Set random-seed + set_all_random_seed(seed) + + # 2a. Build speech2xvec [Optional] + if mode == "sond_demo" and param_dict is not None and "extract_profile" in param_dict and param_dict["extract_profile"]: + assert "sv_train_config" in param_dict, "sv_train_config must be provided param_dict." + assert "sv_model_file" in param_dict, "sv_model_file must be provided in param_dict." + sv_train_config = param_dict["sv_train_config"] + sv_model_file = param_dict["sv_model_file"] + if "model_dir" in param_dict: + sv_train_config = os.path.join(param_dict["model_dir"], sv_train_config) + sv_model_file = os.path.join(param_dict["model_dir"], sv_model_file) + from funasr.bin.sv_infer import Speech2Xvector + speech2xvector_kwargs = dict( + sv_train_config=sv_train_config, + sv_model_file=sv_model_file, + device=device, + dtype=dtype, + streaming=streaming, + embedding_node="resnet1_dense" + ) + logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs)) + speech2xvector = Speech2Xvector.from_pretrained( + model_tag=model_tag, + **speech2xvector_kwargs, + ) + speech2xvector.sv_model.eval() + + # 2b. Build speech2diar + speech2diar_kwargs = dict( + diar_train_config=diar_train_config, + diar_model_file=diar_model_file, + device=device, + dtype=dtype, + streaming=streaming, + smooth_size=smooth_size, + dur_threshold=dur_threshold, + ) + logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs)) + speech2diar = Speech2DiarizationSOND.from_pretrained( + model_tag=model_tag, + **speech2diar_kwargs, + ) + speech2diar.diar_model.eval() + + def output_results_str(results: dict, uttid: str): + rst = [] + mid = uttid.rsplit("-", 1)[0] + for key in results: + results[key] = [(x[0]/100, x[1]/100) for x in results[key]] + if out_format == "vad": + for spk, segs in results.items(): + rst.append("{} {}".format(spk, segs)) + else: + template = "SPEAKER {} 0 {:.2f} {:.2f} {} " + for spk, segs in results.items(): + rst.extend([template.format(mid, st, ed, spk) for st, ed in segs]) + + return "\n".join(rst) + + def _forward( + data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, + raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None, + output_dir_v2: Optional[str] = None, + param_dict: Optional[dict] = None, + ): + logging.info("param_dict: {}".format(param_dict)) + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, (list, tuple)): + if not isinstance(raw_inputs[0], List): + raw_inputs = [raw_inputs] + + assert all([len(example) >= 2 for example in raw_inputs]), \ + "The length of test case in raw_inputs must larger than 1 (>=2)." 
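+            # prepare_dataset() below builds a minimal (keys, batch) generator from
+            # raw_inputs instead of a file-based loader: each example is expected to be
+            # [mixture, enrollment_1, enrollment_2, ...]; bytes/paths are first decoded
+            # into waveforms, every item after the first is turned into a speaker
+            # profile with speech2xvector.calculate_embedding, the profiles are stacked
+            # with torch.cat, and one single-utterance batch {"speech", "profile"} is
+            # yielded per example, mirroring the streaming iterator used in the
+            # data_path_and_name_and_type branch.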
+ + def prepare_dataset(): + for idx, example in enumerate(raw_inputs): + # read waveform file + example = [load_bytes(x) if isinstance(x, bytes) else x + for x in example] + example = [soundfile.read(x)[0] if isinstance(x, str) else x + for x in example] + # convert torch tensor to numpy array + example = [x.numpy() if isinstance(example[0], torch.Tensor) else x + for x in example] + speech = example[0] + logging.info("Extracting profiles for {} waveforms".format(len(example)-1)) + profile = [speech2xvector.calculate_embedding(x) for x in example[1:]] + profile = torch.cat(profile, dim=0) + yield ["test{}".format(idx)], {"speech": [speech], "profile": [profile]} + + loader = prepare_dataset() + else: + raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ") + else: + # 3. Build data-iterator + loader = ASRTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=None, + collate_fn=None, + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + # 7. Start for-loop + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path is not None: + os.makedirs(output_path, exist_ok=True) + output_writer = open("{}/result.txt".format(output_path), "w") + pse_label_writer = open("{}/labels.txt".format(output_path), "w") + logging.info("Start to diarize...") + result_list = [] + for idx, (keys, batch) in enumerate(loader): + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} + + results, pse_labels = speech2diar(**batch) + # Only supporting batch_size==1 + key, value = keys[0], output_results_str(results, keys[0]) + item = {"key": key, "value": value} + result_list.append(item) + if output_path is not None: + output_writer.write(value) + output_writer.flush() + pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels))) + pse_label_writer.flush() + + if idx % 100 == 0: + logging.info("Processing {:5d}: {}".format(idx, key)) + + if output_path is not None: + output_writer.close() + pse_label_writer.close() + + return result_list + + return _forward + +def inference_eend( + diar_train_config: str, + diar_model_file: str, + output_dir: Optional[str] = None, + batch_size: int = 1, + dtype: str = "float32", + ngpu: int = 1, + num_workers: int = 0, + log_level: Union[int, str] = "INFO", + key_file: Optional[str] = None, + model_tag: Optional[str] = None, + allow_variable_data_keys: bool = True, + streaming: bool = False, + param_dict: Optional[dict] = None, + **kwargs, +): + assert check_argument_types() + ncpu = kwargs.get("ncpu", 1) + torch.set_num_threads(ncpu) + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.info("param_dict: {}".format(param_dict)) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. 
Build speech2diar + speech2diar_kwargs = dict( + diar_train_config=diar_train_config, + diar_model_file=diar_model_file, + device=device, + dtype=dtype, + ) + logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs)) + speech2diar = Speech2DiarizationEEND.from_pretrained( + model_tag=model_tag, + **speech2diar_kwargs, + ) + speech2diar.diar_model.eval() + + def output_results_str(results: dict, uttid: str): + rst = [] + mid = uttid.rsplit("-", 1)[0] + for key in results: + results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]] + template = "SPEAKER {} 0 {:.2f} {:.2f} {} " + for spk, segs in results.items(): + rst.extend([template.format(mid, st, ed, spk) for st, ed in segs]) + + return "\n".join(rst) + + def _forward( + data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, + raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None, + output_dir_v2: Optional[str] = None, + param_dict: Optional[dict] = None, + ): + # 2. Build data-iterator + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, torch.Tensor): + raw_inputs = raw_inputs.numpy() + data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"] + loader = EENDOLADiarTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False), + collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False), + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + # 3. Start for-loop + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path is not None: + os.makedirs(output_path, exist_ok=True) + output_writer = open("{}/result.txt".format(output_path), "w") + result_list = [] + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} + + results = speech2diar(**batch) + + # post process + a = results[0][0].cpu().numpy() + a = medfilt(a, (11, 1)) + rst = [] + for spkid, frames in enumerate(a.T): + frames = np.pad(frames, (1, 1), 'constant') + changes, = np.where(np.diff(frames, axis=0) != 0) + fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} " + for s, e in zip(changes[::2], changes[1::2]): + st = s / 10. + dur = (e - s) / 10. 
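+                    # st and dur are expressed in seconds: the frame-level activity from
+                    # EEND-OLA is assumed to use a 100 ms frame shift, so onset/offset
+                    # frame indices are divided by 10 before being formatted into the
+                    # RTTM SPEAKER line below.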
+ rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid)))) + + # Only supporting batch_size==1 + value = "\n".join(rst) + item = {"key": keys[0], "value": value} + result_list.append(item) + if output_path is not None: + output_writer.write(value) + output_writer.flush() + + if output_path is not None: + output_writer.close() + + return result_list + + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( @@ -127,10 +473,8 @@ def get_parser(): def inference_launch(mode, **kwargs): if mode == "sond": - from funasr.bin.sond_inference import inference_modelscope - return inference_modelscope(mode=mode, **kwargs) + return inference_sond(mode=mode, **kwargs) elif mode == "sond_demo": - from funasr.bin.sond_inference import inference_modelscope param_dict = { "extract_profile": True, "sv_train_config": "sv.yaml", @@ -142,10 +486,9 @@ def inference_launch(mode, **kwargs): kwargs["param_dict"][key] = param_dict[key] else: kwargs["param_dict"] = param_dict - return inference_modelscope(mode=mode, **kwargs) + return inference_sond(mode=mode, **kwargs) elif mode == "eend-ola": - from funasr.bin.eend_ola_inference import inference_modelscope - return inference_modelscope(mode=mode, **kwargs) + return inference_eend(mode=mode, **kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None @@ -178,7 +521,8 @@ def main(cmd=None): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - inference_launch(**kwargs) + inference_pipeline = inference_launch(**kwargs) + return inference_pipeline(kwargs["data_path_and_name_and_type"]) if __name__ == "__main__": diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py deleted file mode 100755 index 87816dd22..000000000 --- a/funasr/bin/eend_ola_inference.py +++ /dev/null @@ -1,429 +0,0 @@ -#!/usr/bin/env python3 -# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. -# MIT License (https://opensource.org/licenses/MIT) - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union - -import numpy as np -import torch -from scipy.signal import medfilt -from typeguard import check_argument_types - -from funasr.models.frontend.wav_frontend import WavFrontendMel23 -from funasr.tasks.diar import EENDOLADiarTask -from funasr.torch_utils.device_funcs import to_device -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none - - -class Speech2Diarization: - """Speech2Diarlization class - - Examples: - >>> import soundfile - >>> import numpy as np - >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb") - >>> profile = np.load("profiles.npy") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2diar(audio, profile) - {"spk1": [(int, int), ...], ...} - - """ - - def __init__( - self, - diar_train_config: Union[Path, str] = None, - diar_model_file: Union[Path, str] = None, - device: str = "cpu", - dtype: str = "float32", - ): - assert check_argument_types() - - # 1. 
Build Diarization model - diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file( - config_file=diar_train_config, - model_file=diar_model_file, - device=device - ) - frontend = None - if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None: - frontend = WavFrontendMel23(**diar_train_args.frontend_conf) - - # set up seed for eda - np.random.seed(diar_train_args.seed) - torch.manual_seed(diar_train_args.seed) - torch.cuda.manual_seed(diar_train_args.seed) - os.environ['PYTORCH_SEED'] = str(diar_train_args.seed) - logging.info("diar_model: {}".format(diar_model)) - logging.info("diar_train_args: {}".format(diar_train_args)) - diar_model.to(dtype=getattr(torch, dtype)).eval() - - self.diar_model = diar_model - self.diar_train_args = diar_train_args - self.device = device - self.dtype = dtype - self.frontend = frontend - - @torch.no_grad() - def __call__( - self, - speech: Union[torch.Tensor, np.ndarray], - speech_lengths: Union[torch.Tensor, np.ndarray] = None - ): - """Inference - - Args: - speech: Input speech data - Returns: - diarization results - - """ - assert check_argument_types() - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.diar_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - batch = {"speech": feats, "speech_lengths": feats_len} - batch = to_device(batch, device=self.device) - results = self.diar_model.estimate_sequential(**batch) - - return results - - @staticmethod - def from_pretrained( - model_tag: Optional[str] = None, - **kwargs: Optional[Any], - ): - """Build Speech2Diarization instance from the pretrained model. - - Args: - model_tag (Optional[str]): Model tag of the pretrained models. - Currently, the tags of espnet_model_zoo are supported. - - Returns: - Speech2Diarization: Speech2Diarization instance. - - """ - if model_tag is not None: - try: - from espnet_model_zoo.downloader import ModelDownloader - - except ImportError: - logging.error( - "`espnet_model_zoo` is not installed. " - "Please install via `pip install -U espnet_model_zoo`." - ) - raise - d = ModelDownloader() - kwargs.update(**d.download_and_unpack(model_tag)) - - return Speech2Diarization(**kwargs) - - -def inference_modelscope( - diar_train_config: str, - diar_model_file: str, - output_dir: Optional[str] = None, - batch_size: int = 1, - dtype: str = "float32", - ngpu: int = 1, - num_workers: int = 0, - log_level: Union[int, str] = "INFO", - key_file: Optional[str] = None, - model_tag: Optional[str] = None, - allow_variable_data_keys: bool = True, - streaming: bool = False, - param_dict: Optional[dict] = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.info("param_dict: {}".format(param_dict)) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. 
Build speech2diar - speech2diar_kwargs = dict( - diar_train_config=diar_train_config, - diar_model_file=diar_model_file, - device=device, - dtype=dtype, - ) - logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs)) - speech2diar = Speech2Diarization.from_pretrained( - model_tag=model_tag, - **speech2diar_kwargs, - ) - speech2diar.diar_model.eval() - - def output_results_str(results: dict, uttid: str): - rst = [] - mid = uttid.rsplit("-", 1)[0] - for key in results: - results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]] - template = "SPEAKER {} 0 {:.2f} {:.2f} {} " - for spk, segs in results.items(): - rst.extend([template.format(mid, st, ed, spk) for st, ed in segs]) - - return "\n".join(rst) - - def _forward( - data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, - raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None, - output_dir_v2: Optional[str] = None, - param_dict: Optional[dict] = None, - ): - # 2. Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"] - loader = EENDOLADiarTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False), - collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - # 3. Start for-loop - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - os.makedirs(output_path, exist_ok=True) - output_writer = open("{}/result.txt".format(output_path), "w") - result_list = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - results = speech2diar(**batch) - - # post process - a = results[0][0].cpu().numpy() - a = medfilt(a, (11, 1)) - rst = [] - for spkid, frames in enumerate(a.T): - frames = np.pad(frames, (1, 1), 'constant') - changes, = np.where(np.diff(frames, axis=0) != 0) - fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} " - for s, e in zip(changes[::2], changes[1::2]): - st = s / 10. - dur = (e - s) / 10. 
- rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid)))) - - # Only supporting batch_size==1 - value = "\n".join(rst) - item = {"key": keys[0], "value": value} - result_list.append(item) - if output_path is not None: - output_writer.write(value) - output_writer.flush() - - if output_path is not None: - output_writer.close() - - return result_list - - return _forward - - -def inference( - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - diar_train_config: Optional[str], - diar_model_file: Optional[str], - output_dir: Optional[str] = None, - batch_size: int = 1, - dtype: str = "float32", - ngpu: int = 0, - seed: int = 0, - num_workers: int = 1, - log_level: Union[int, str] = "INFO", - key_file: Optional[str] = None, - model_tag: Optional[str] = None, - allow_variable_data_keys: bool = True, - streaming: bool = False, - smooth_size: int = 83, - dur_threshold: int = 10, - out_format: str = "vad", - **kwargs, -): - inference_pipeline = inference_modelscope( - diar_train_config=diar_train_config, - diar_model_file=diar_model_file, - output_dir=output_dir, - batch_size=batch_size, - dtype=dtype, - ngpu=ngpu, - seed=seed, - num_workers=num_workers, - log_level=log_level, - key_file=key_file, - model_tag=model_tag, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - smooth_size=smooth_size, - dur_threshold=dur_threshold, - out_format=out_format, - **kwargs, - ) - - return inference_pipeline(data_path_and_name_and_type, raw_inputs=None) - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Speaker verification/x-vector extraction", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--diar_train_config", - type=str, - help="diarization training configuration", - ) - group.add_argument( - "--diar_model_file", - type=str, - help="diarization model parameter file", - ) - group.add_argument( - "--dur_threshold", - type=int, - default=10, - help="The threshold for short segments in number frames" - ) - parser.add_argument( - "--smooth_size", - type=int, - default=83, - help="The smoothing window length in number frames" - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. 
If specify this option, *_train_config and " - "*_file will be overwritten", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - parser.add_argument("--streaming", type=str2bool, default=False) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - logging.info("args: {}".format(kwargs)) - if args.output_dir is None: - jobid, n_gpu = 1, 1 - gpuid = args.gpuid_list.split(",")[jobid - 1] - else: - jobid = int(args.output_dir.split(".")[-1]) - n_gpu = len(args.gpuid_list.split(",")) - gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu] - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - results_list = inference(**kwargs) - for results in results_list: - print("{} {}".format(results["key"], results["value"])) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/modelscope_infer.py b/funasr/bin/modelscope_infer.py deleted file mode 100755 index bc24340b5..000000000 --- a/funasr/bin/modelscope_infer.py +++ /dev/null @@ -1,90 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -import os - -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == '__main__': - parser = argparse.ArgumentParser( - description="decoding configs", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - parser.add_argument("--model_name", - type=str, - default="speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", - help="model name in modelscope") - parser.add_argument("--model_revision", - type=str, - default="v1.0.4", - help="model revision in modelscope") - parser.add_argument("--local_model_path", - type=str, - default=None, - help="local model path, usually for fine-tuning") - parser.add_argument("--wav_list", - type=str, - help="input wav list") - parser.add_argument("--output_file", - type=str, - help="saving decoding results") - parser.add_argument( - "--njob", - type=int, - default=1, - help="The number of jobs for each gpu", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - args = parser.parse_args() - - # set logging messages - logging.basicConfig( - level=logging.INFO, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.info("Decoding args: {}".format(args)) - - # gpu setting - if args.ngpu > 0: - jobid = int(args.output_file.split(".")[-1]) - gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob] - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - - if args.local_model_path is None: - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/{}".format(args.model_name), - model_revision=args.model_revision) - else: - inference_pipeline = pipeline( - task=Tasks.auto_speech_recognition, - model=args.local_model_path) - - - with open(args.wav_list, 'r') as f_wav: - wav_lines = f_wav.readlines() - - with open(args.output_file, "w") as f_out: - for line in wav_lines: - wav_id, wav_path = line.strip().split() - logging.info("decoding, utt_id: ['{}']".format(wav_id)) - rec_result = inference_pipeline(audio_in=wav_path) - if 'text' in rec_result: - text = rec_result["text"] - else: - text = '' - f_out.write(wav_id + " " + text + "\n") - logging.info("best hypo: {} \n".format(text)) diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punc_infer.py similarity index 53% rename from funasr/bin/punctuation_infer_vadrealtime.py rename to funasr/bin/punc_infer.py index 0dc01f531..41c4da323 100644 --- a/funasr/bin/punctuation_infer_vadrealtime.py +++ b/funasr/bin/punc_infer.py @@ -61,16 +61,10 @@ class Text2Punc: text_name="text", non_linguistic_symbols=train_args.non_linguistic_symbols, ) - @torch.no_grad() - def __call__(self, text: Union[list, str], cache: list, split_size=20): - if cache is not None and len(cache) > 0: - precache = "".join(cache) - else: - precache = "" - cache = [] - data = {"text": precache + " " + text} + def __call__(self, text: Union[list, str], split_size=20): + data = {"text": text} result = self.preprocessor(data=data, uid="12938712838719") split_text = self.preprocessor.pop_split_text_data(result) mini_sentences = split_to_mini_sentence(split_text, split_size) @@ -78,10 +72,9 @@ class Text2Punc: assert len(mini_sentences) == len(mini_sentences_id) cache_sent = [] cache_sent_id = torch.from_numpy(np.array([], dtype='int32')) - sentence_punc_list = [] - sentence_words_list= [] + new_mini_sentence = "" + new_mini_sentence_punc = [] cache_pop_trigger_limit = 200 - skip_num = 0 for mini_sentence_i in range(len(mini_sentences)): mini_sentence = mini_sentences[mini_sentence_i] mini_sentence_id = mini_sentences_id[mini_sentence_i] @@ -90,7 +83,6 @@ class Text2Punc: data = { "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0), "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')), - "vad_indexes": torch.from_numpy(np.array([len(cache)], dtype='int32')), } data = to_device(data, self.device) y, _ = self.wrapped_model(**data) @@ -110,7 +102,7 @@ class Text2Punc: break if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": last_comma_index = i - + if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0: # The sentence it too long, cut off at a comma. 
sentenceEnd = last_comma_index @@ -120,10 +112,130 @@ class Text2Punc: mini_sentence = mini_sentence[0:sentenceEnd + 1] punctuations = punctuations[0:sentenceEnd + 1] + # if len(punctuations) == 0: + # continue + + punctuations_np = punctuations.cpu().numpy() + new_mini_sentence_punc += [int(x) for x in punctuations_np] + words_with_punc = [] + for i in range(len(mini_sentence)): + if i > 0: + if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1: + mini_sentence[i] = " " + mini_sentence[i] + words_with_punc.append(mini_sentence[i]) + if self.punc_list[punctuations[i]] != "_": + words_with_punc.append(self.punc_list[punctuations[i]]) + new_mini_sentence += "".join(words_with_punc) + # Add Period for the end of the sentence + new_mini_sentence_out = new_mini_sentence + new_mini_sentence_punc_out = new_mini_sentence_punc + if mini_sentence_i == len(mini_sentences) - 1: + if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、": + new_mini_sentence_out = new_mini_sentence[:-1] + "。" + new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] + elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?": + new_mini_sentence_out = new_mini_sentence + "。" + new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] + return new_mini_sentence_out, new_mini_sentence_punc_out + + +class Text2PuncVADRealtime: + + def __init__( + self, + train_config: Optional[str], + model_file: Optional[str], + device: str = "cpu", + dtype: str = "float32", + ): + # Build Model + model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device) + self.device = device + # Wrape model to make model.nll() data-parallel + self.wrapped_model = ForwardAdaptor(model, "inference") + self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval() + # logging.info(f"Model:\n{model}") + self.punc_list = train_args.punc_list + self.period = 0 + for i in range(len(self.punc_list)): + if self.punc_list[i] == ",": + self.punc_list[i] = "," + elif self.punc_list[i] == "?": + self.punc_list[i] = "?" 
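+            # the "。" branch below records the index of the period label, which is reused later to force a sentence ending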
+ elif self.punc_list[i] == "。": + self.period = i + self.preprocessor = CodeMixTokenizerCommonPreprocessor( + train=False, + token_type=train_args.token_type, + token_list=train_args.token_list, + bpemodel=train_args.bpemodel, + text_cleaner=train_args.cleaner, + g2p_type=train_args.g2p, + text_name="text", + non_linguistic_symbols=train_args.non_linguistic_symbols, + ) + + @torch.no_grad() + def __call__(self, text: Union[list, str], cache: list, split_size=20): + if cache is not None and len(cache) > 0: + precache = "".join(cache) + else: + precache = "" + cache = [] + data = {"text": precache + " " + text} + result = self.preprocessor(data=data, uid="12938712838719") + split_text = self.preprocessor.pop_split_text_data(result) + mini_sentences = split_to_mini_sentence(split_text, split_size) + mini_sentences_id = split_to_mini_sentence(data["text"], split_size) + assert len(mini_sentences) == len(mini_sentences_id) + cache_sent = [] + cache_sent_id = torch.from_numpy(np.array([], dtype='int32')) + sentence_punc_list = [] + sentence_words_list = [] + cache_pop_trigger_limit = 200 + skip_num = 0 + for mini_sentence_i in range(len(mini_sentences)): + mini_sentence = mini_sentences[mini_sentence_i] + mini_sentence_id = mini_sentences_id[mini_sentence_i] + mini_sentence = cache_sent + mini_sentence + mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0) + data = { + "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0), + "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')), + "vad_indexes": torch.from_numpy(np.array([len(cache)], dtype='int32')), + } + data = to_device(data, self.device) + y, _ = self.wrapped_model(**data) + _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) + punctuations = indices + if indices.size()[0] != 1: + punctuations = torch.squeeze(indices) + assert punctuations.size()[0] == len(mini_sentence) + + # Search for the last Period/QuestionMark as cache + if mini_sentence_i < len(mini_sentences) - 1: + sentenceEnd = -1 + last_comma_index = -1 + for i in range(len(punctuations) - 2, 1, -1): + if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?": + sentenceEnd = i + break + if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": + last_comma_index = i + + if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0: + # The sentence it too long, cut off at a comma. 
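+                    # fall back: promote the last comma to a period so the sentence can be closed and the cache kept bounded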
+ sentenceEnd = last_comma_index + punctuations[sentenceEnd] = self.period + cache_sent = mini_sentence[sentenceEnd + 1:] + cache_sent_id = mini_sentence_id[sentenceEnd + 1:] + mini_sentence = mini_sentence[0:sentenceEnd + 1] + punctuations = punctuations[0:sentenceEnd + 1] + punctuations_np = punctuations.cpu().numpy() sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np] sentence_words_list += mini_sentence - + assert len(sentence_punc_list) == len(sentence_words_list) words_with_punc = [] sentence_punc_list_out = [] @@ -140,172 +252,16 @@ class Text2Punc: if sentence_punc_list[i] != "_": words_with_punc.append(sentence_punc_list[i]) sentence_out = "".join(words_with_punc) - + sentenceEnd = -1 for i in range(len(sentence_punc_list) - 2, 1, -1): if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?": - sentenceEnd = i - break - cache_out = sentence_words_list[sentenceEnd + 1 :] + sentenceEnd = i + break + cache_out = sentence_words_list[sentenceEnd + 1:] if sentence_out[-1] in self.punc_list: sentence_out = sentence_out[:-1] sentence_punc_list_out[-1] = "_" return sentence_out, sentence_punc_list_out, cache_out -def inference( - batch_size: int, - dtype: str, - ngpu: int, - seed: int, - num_workers: int, - output_dir: str, - log_level: Union[int, str], - train_config: Optional[str], - model_file: Optional[str], - key_file: Optional[str] = None, - data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, - raw_inputs: Union[List[Any], bytes, str] = None, - cache: List[Any] = None, - param_dict: dict = None, - **kwargs, -): - inference_pipeline = inference_modelscope( - output_dir=output_dir, - batch_size=batch_size, - dtype=dtype, - ngpu=ngpu, - seed=seed, - num_workers=num_workers, - log_level=log_level, - key_file=key_file, - train_config=train_config, - model_file=model_file, - param_dict=param_dict, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs, cache) - - -def inference_modelscope( - batch_size: int, - dtype: str, - ngpu: int, - seed: int, - num_workers: int, - log_level: Union[int, str], - #cache: list, - key_file: Optional[str], - train_config: Optional[str], - model_file: Optional[str], - output_dir: Optional[str] = None, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. 
Set random-seed - set_all_random_seed(seed) - text2punc = Text2Punc(train_config, model_file, device) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[List[Any], bytes, str] = None, - output_dir_v2: Optional[str] = None, - cache: List[Any] = None, - param_dict: dict = None, - ): - results = [] - split_size = 10 - cache_in = param_dict["cache"] - if raw_inputs != None: - line = raw_inputs.strip() - key = "demo" - if line == "": - item = {'key': key, 'value': ""} - results.append(item) - return results - result, _, cache = text2punc(line, cache_in) - param_dict["cache"] = cache - item = {'key': key, 'value': result} - results.append(item) - return results - - return results - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Punctuation inference", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False) - group.add_argument("--raw_inputs", type=str, required=False) - group.add_argument("--cache", type=list, required=False) - group.add_argument("--param_dict", type=dict, required=False) - group.add_argument("--key_file", type=str_or_none) - - group = parser.add_argument_group("The model configuration related") - group.add_argument("--train_config", type=str) - group.add_argument("--model_file", type=str) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - # kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py index b1d923553..594a7be21 100755 --- a/funasr/bin/punc_inference_launch.py +++ b/funasr/bin/punc_inference_launch.py @@ -14,6 +14,166 @@ from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none from funasr.utils.types import float_or_none +import argparse +import logging +from pathlib import Path +import sys +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Any +from typing import List + +import numpy as np +import torch +from typeguard import check_argument_types + +from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor +from funasr.utils.cli_utils import get_commandline_args +from funasr.tasks.punctuation import PunctuationTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.forward_adaptor import ForwardAdaptor +from 
funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.datasets.preprocessor import split_to_mini_sentence +from funasr.bin.punc_infer import Text2Punc, Text2PuncVADRealtime + +def inference_punc( + batch_size: int, + dtype: str, + ngpu: int, + seed: int, + num_workers: int, + log_level: Union[int, str], + key_file: Optional[str], + train_config: Optional[str], + model_file: Optional[str], + output_dir: Optional[str] = None, + param_dict: dict = None, + **kwargs, +): + assert check_argument_types() + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. Set random-seed + set_all_random_seed(seed) + text2punc = Text2Punc(train_config, model_file, device) + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[List[Any], bytes, str] = None, + output_dir_v2: Optional[str] = None, + cache: List[Any] = None, + param_dict: dict = None, + ): + results = [] + split_size = 20 + + if raw_inputs != None: + line = raw_inputs.strip() + key = "demo" + if line == "": + item = {'key': key, 'value': ""} + results.append(item) + return results + result, _ = text2punc(line) + item = {'key': key, 'value': result} + results.append(item) + return results + + for inference_text, _, _ in data_path_and_name_and_type: + with open(inference_text, "r", encoding="utf-8") as fin: + for line in fin: + line = line.strip() + segs = line.split("\t") + if len(segs) != 2: + continue + key = segs[0] + if len(segs[1]) == 0: + continue + result, _ = text2punc(segs[1]) + item = {'key': key, 'value': result} + results.append(item) + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path != None: + output_file_name = "infer.out" + Path(output_path).mkdir(parents=True, exist_ok=True) + output_file_path = (Path(output_path) / output_file_name).absolute() + with open(output_file_path, "w", encoding="utf-8") as fout: + for item_i in results: + key_out = item_i["key"] + value_out = item_i["value"] + fout.write(f"{key_out}\t{value_out}\n") + return results + + return _forward + +def inference_punc_vad_realtime( + batch_size: int, + dtype: str, + ngpu: int, + seed: int, + num_workers: int, + log_level: Union[int, str], + #cache: list, + key_file: Optional[str], + train_config: Optional[str], + model_file: Optional[str], + output_dir: Optional[str] = None, + param_dict: dict = None, + **kwargs, +): + assert check_argument_types() + ncpu = kwargs.get("ncpu", 1) + torch.set_num_threads(ncpu) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. 
Set random-seed + set_all_random_seed(seed) + text2punc = Text2PuncVADRealtime(train_config, model_file, device) + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[List[Any], bytes, str] = None, + output_dir_v2: Optional[str] = None, + cache: List[Any] = None, + param_dict: dict = None, + ): + results = [] + split_size = 10 + cache_in = param_dict["cache"] + if raw_inputs != None: + line = raw_inputs.strip() + key = "demo" + if line == "": + item = {'key': key, 'value': ""} + results.append(item) + return results + result, _, cache = text2punc(line, cache_in) + param_dict["cache"] = cache + item = {'key': key, 'value': result} + results.append(item) + return results + + return results + + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( @@ -72,11 +232,9 @@ def get_parser(): def inference_launch(mode, **kwargs): if mode == "punc": - from funasr.bin.punctuation_infer import inference_modelscope - return inference_modelscope(**kwargs) + return inference_punc(**kwargs) if mode == "punc_VadRealtime": - from funasr.bin.punctuation_infer_vadrealtime import inference_modelscope - return inference_modelscope(**kwargs) + return inference_punc_vad_realtime(**kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None @@ -105,7 +263,9 @@ def main(cmd=None): kwargs.pop("gpuid_list", None) kwargs.pop("njob", None) - results = inference_launch(**kwargs) + inference_pipeline = inference_launch(**kwargs) + return inference_pipeline(kwargs["data_path_and_name_and_type"]) + if __name__ == "__main__": diff --git a/funasr/bin/punctuation_infer.py b/funasr/bin/punctuation_infer.py deleted file mode 100644 index 077814d4f..000000000 --- a/funasr/bin/punctuation_infer.py +++ /dev/null @@ -1,320 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import logging -from pathlib import Path -import sys -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Any -from typing import List - -import numpy as np -import torch -from typeguard import check_argument_types - -from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor -from funasr.utils.cli_utils import get_commandline_args -from funasr.tasks.punctuation import PunctuationTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.forward_adaptor import ForwardAdaptor -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.datasets.preprocessor import split_to_mini_sentence - - -class Text2Punc: - - def __init__( - self, - train_config: Optional[str], - model_file: Optional[str], - device: str = "cpu", - dtype: str = "float32", - ): - # Build Model - model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device) - self.device = device - # Wrape model to make model.nll() data-parallel - self.wrapped_model = ForwardAdaptor(model, "inference") - self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval() - # logging.info(f"Model:\n{model}") - self.punc_list = train_args.punc_list - self.period = 0 - for i in range(len(self.punc_list)): - if self.punc_list[i] == ",": - self.punc_list[i] = "," - elif self.punc_list[i] == "?": - self.punc_list[i] = "?" 
- elif self.punc_list[i] == "。": - self.period = i - self.preprocessor = CodeMixTokenizerCommonPreprocessor( - train=False, - token_type=train_args.token_type, - token_list=train_args.token_list, - bpemodel=train_args.bpemodel, - text_cleaner=train_args.cleaner, - g2p_type=train_args.g2p, - text_name="text", - non_linguistic_symbols=train_args.non_linguistic_symbols, - ) - - @torch.no_grad() - def __call__(self, text: Union[list, str], split_size=20): - data = {"text": text} - result = self.preprocessor(data=data, uid="12938712838719") - split_text = self.preprocessor.pop_split_text_data(result) - mini_sentences = split_to_mini_sentence(split_text, split_size) - mini_sentences_id = split_to_mini_sentence(data["text"], split_size) - assert len(mini_sentences) == len(mini_sentences_id) - cache_sent = [] - cache_sent_id = torch.from_numpy(np.array([], dtype='int32')) - new_mini_sentence = "" - new_mini_sentence_punc = [] - cache_pop_trigger_limit = 200 - for mini_sentence_i in range(len(mini_sentences)): - mini_sentence = mini_sentences[mini_sentence_i] - mini_sentence_id = mini_sentences_id[mini_sentence_i] - mini_sentence = cache_sent + mini_sentence - mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0) - data = { - "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0), - "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')), - } - data = to_device(data, self.device) - y, _ = self.wrapped_model(**data) - _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1) - punctuations = indices - if indices.size()[0] != 1: - punctuations = torch.squeeze(indices) - assert punctuations.size()[0] == len(mini_sentence) - - # Search for the last Period/QuestionMark as cache - if mini_sentence_i < len(mini_sentences) - 1: - sentenceEnd = -1 - last_comma_index = -1 - for i in range(len(punctuations) - 2, 1, -1): - if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?": - sentenceEnd = i - break - if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",": - last_comma_index = i - - if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0: - # The sentence it too long, cut off at a comma. 
- sentenceEnd = last_comma_index - punctuations[sentenceEnd] = self.period - cache_sent = mini_sentence[sentenceEnd + 1:] - cache_sent_id = mini_sentence_id[sentenceEnd + 1:] - mini_sentence = mini_sentence[0:sentenceEnd + 1] - punctuations = punctuations[0:sentenceEnd + 1] - - # if len(punctuations) == 0: - # continue - - punctuations_np = punctuations.cpu().numpy() - new_mini_sentence_punc += [int(x) for x in punctuations_np] - words_with_punc = [] - for i in range(len(mini_sentence)): - if i > 0: - if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1: - mini_sentence[i] = " " + mini_sentence[i] - words_with_punc.append(mini_sentence[i]) - if self.punc_list[punctuations[i]] != "_": - words_with_punc.append(self.punc_list[punctuations[i]]) - new_mini_sentence += "".join(words_with_punc) - # Add Period for the end of the sentence - new_mini_sentence_out = new_mini_sentence - new_mini_sentence_punc_out = new_mini_sentence_punc - if mini_sentence_i == len(mini_sentences) - 1: - if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、": - new_mini_sentence_out = new_mini_sentence[:-1] + "。" - new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] - elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?": - new_mini_sentence_out = new_mini_sentence + "。" - new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period] - return new_mini_sentence_out, new_mini_sentence_punc_out - - -def inference( - batch_size: int, - dtype: str, - ngpu: int, - seed: int, - num_workers: int, - output_dir: str, - log_level: Union[int, str], - train_config: Optional[str], - model_file: Optional[str], - key_file: Optional[str] = None, - data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, - raw_inputs: Union[List[Any], bytes, str] = None, - cache: List[Any] = None, - param_dict: dict = None, - **kwargs, -): - inference_pipeline = inference_modelscope( - output_dir=output_dir, - batch_size=batch_size, - dtype=dtype, - ngpu=ngpu, - seed=seed, - num_workers=num_workers, - log_level=log_level, - key_file=key_file, - train_config=train_config, - model_file=model_file, - param_dict=param_dict, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - batch_size: int, - dtype: str, - ngpu: int, - seed: int, - num_workers: int, - log_level: Union[int, str], - key_file: Optional[str], - train_config: Optional[str], - model_file: Optional[str], - output_dir: Optional[str] = None, - param_dict: dict = None, - **kwargs, -): - assert check_argument_types() - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. 
Set random-seed - set_all_random_seed(seed) - text2punc = Text2Punc(train_config, model_file, device) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[List[Any], bytes, str] = None, - output_dir_v2: Optional[str] = None, - cache: List[Any] = None, - param_dict: dict = None, - ): - results = [] - split_size = 20 - - if raw_inputs != None: - line = raw_inputs.strip() - key = "demo" - if line == "": - item = {'key': key, 'value': ""} - results.append(item) - return results - result, _ = text2punc(line) - item = {'key': key, 'value': result} - results.append(item) - return results - - for inference_text, _, _ in data_path_and_name_and_type: - with open(inference_text, "r", encoding="utf-8") as fin: - for line in fin: - line = line.strip() - segs = line.split("\t") - if len(segs) != 2: - continue - key = segs[0] - if len(segs[1]) == 0: - continue - result, _ = text2punc(segs[1]) - item = {'key': key, 'value': result} - results.append(item) - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path != None: - output_file_name = "infer.out" - Path(output_path).mkdir(parents=True, exist_ok=True) - output_file_path = (Path(output_path) / output_file_name).absolute() - with open(output_file_path, "w", encoding="utf-8") as fout: - for item_i in results: - key_out = item_i["key"] - value_out = item_i["value"] - fout.write(f"{key_out}\t{value_out}\n") - return results - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Punctuation inference", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False) - group.add_argument("--raw_inputs", type=str, required=False) - group.add_argument("--cache", type=list, required=False) - group.add_argument("--param_dict", type=dict, required=False) - group.add_argument("--key_file", type=str_or_none) - - group = parser.add_argument_group("The model configuration related") - group.add_argument("--train_config", type=str) - group.add_argument("--model_file", type=str) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - # kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py deleted file mode 100755 index c55bc3544..000000000 --- a/funasr/bin/sond_inference.py +++ /dev/null @@ -1,577 +0,0 @@ -#!/usr/bin/env python3 -# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. -# MIT License (https://opensource.org/licenses/MIT) - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union - -from collections import OrderedDict -import numpy as np -import soundfile -import torch -from torch.nn import functional as F -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.utils.cli_utils import get_commandline_args -from funasr.tasks.diar import DiarTask -from funasr.tasks.asr import ASRTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from scipy.ndimage import median_filter -from funasr.utils.misc import statistic_model_parameters -from funasr.datasets.iterable_dataset import load_bytes - - -class Speech2Diarization: - """Speech2Xvector class - - Examples: - >>> import soundfile - >>> import numpy as np - >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb") - >>> profile = np.load("profiles.npy") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2diar(audio, profile) - {"spk1": [(int, int), ...], ...} - - """ - - def __init__( - self, - diar_train_config: Union[Path, str] = None, - diar_model_file: Union[Path, str] = None, - device: Union[str, torch.device] = "cpu", - batch_size: int = 1, - dtype: str = "float32", - streaming: bool = False, - smooth_size: int = 83, - dur_threshold: float = 10, - ): - assert check_argument_types() - - # TODO: 1. 
Build Diarization model - diar_model, diar_train_args = DiarTask.build_model_from_file( - config_file=diar_train_config, - model_file=diar_model_file, - device=device - ) - logging.info("diar_model: {}".format(diar_model)) - logging.info("model parameter number: {}".format(statistic_model_parameters(diar_model))) - logging.info("diar_train_args: {}".format(diar_train_args)) - diar_model.to(dtype=getattr(torch, dtype)).eval() - - self.diar_model = diar_model - self.diar_train_args = diar_train_args - self.token_list = diar_train_args.token_list - self.smooth_size = smooth_size - self.dur_threshold = dur_threshold - self.device = device - self.dtype = dtype - - def smooth_multi_labels(self, multi_label): - multi_label = median_filter(multi_label, (self.smooth_size, 1), mode="constant", cval=0.0).astype(int) - return multi_label - - @staticmethod - def calc_spk_turns(label_arr, spk_list): - turn_list = [] - length = label_arr.shape[0] - n_spk = label_arr.shape[1] - for k in range(n_spk): - if spk_list[k] == "None": - continue - in_utt = False - start = 0 - for i in range(length): - if label_arr[i, k] == 1 and in_utt is False: - start = i - in_utt = True - if label_arr[i, k] == 0 and in_utt is True: - turn_list.append([spk_list[k], start, i - start]) - in_utt = False - if in_utt: - turn_list.append([spk_list[k], start, length - start]) - return turn_list - - @staticmethod - def seq2arr(seq, vec_dim=8): - def int2vec(x, vec_dim=8, dtype=np.int): - b = ('{:0' + str(vec_dim) + 'b}').format(x) - # little-endian order: lower bit first - return (np.array(list(b)[::-1]) == '1').astype(dtype) - - # process oov - seq = np.array([int(x) for x in seq]) - new_seq = [] - for i, x in enumerate(seq): - if x < 2 ** vec_dim: - new_seq.append(x) - else: - idx_list = np.where(seq < 2 ** vec_dim)[0] - idx = np.abs(idx_list - i).argmin() - new_seq.append(seq[idx_list[idx]]) - return np.row_stack([int2vec(x, vec_dim) for x in new_seq]) - - def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"): - logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T - # upsampling outputs to match inputs - ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio - logits_idx = F.upsample( - logits_idx.unsqueeze(1).float(), - size=(ut, ), - mode="nearest", - ).squeeze(1).long() - logits_idx = logits_idx[0].tolist() - pse_labels = [self.token_list[x] for x in logits_idx] - if output_format == "pse_labels": - return pse_labels, None - - multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers - multi_labels = self.smooth_multi_labels(multi_labels) - if output_format == "binary_labels": - return multi_labels, None - - spk_list = ["spk{}".format(i + 1) for i in range(spk_num)] - spk_turns = self.calc_spk_turns(multi_labels, spk_list) - results = OrderedDict() - for spk, st, dur in spk_turns: - if spk not in results: - results[spk] = [] - if dur > self.dur_threshold: - results[spk].append((st, st+dur)) - - # sort segments in start time ascending - for spk in results: - results[spk] = sorted(results[spk], key=lambda x: x[0]) - - return results, pse_labels - - @torch.no_grad() - def __call__( - self, - speech: Union[torch.Tensor, np.ndarray], - profile: Union[torch.Tensor, np.ndarray], - output_format: str = "speaker_turn" - ): - """Inference - - Args: - speech: Input speech data - profile: Speaker profiles - Returns: - diarization results for each speaker - - """ - assert check_argument_types() - # Input as audio signal - if isinstance(speech, 
np.ndarray): - speech = torch.tensor(speech) - if isinstance(profile, np.ndarray): - profile = torch.tensor(profile) - - # data: (Nsamples,) -> (1, Nsamples) - speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - profile = profile.unsqueeze(0).to(getattr(torch, self.dtype)) - # lengths: (1,) - speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) - profile_lengths = profile.new_full([1], dtype=torch.long, fill_value=profile.size(1)) - batch = {"speech": speech, "speech_lengths": speech_lengths, - "profile": profile, "profile_lengths": profile_lengths} - # a. To device - batch = to_device(batch, device=self.device) - - logits = self.diar_model.prediction_forward(**batch) - results, pse_labels = self.post_processing(logits, profile.shape[1], output_format) - - return results, pse_labels - - @staticmethod - def from_pretrained( - model_tag: Optional[str] = None, - **kwargs: Optional[Any], - ): - """Build Speech2Xvector instance from the pretrained model. - - Args: - model_tag (Optional[str]): Model tag of the pretrained models. - Currently, the tags of espnet_model_zoo are supported. - - Returns: - Speech2Xvector: Speech2Xvector instance. - - """ - if model_tag is not None: - try: - from espnet_model_zoo.downloader import ModelDownloader - - except ImportError: - logging.error( - "`espnet_model_zoo` is not installed. " - "Please install via `pip install -U espnet_model_zoo`." - ) - raise - d = ModelDownloader() - kwargs.update(**d.download_and_unpack(model_tag)) - - return Speech2Diarization(**kwargs) - - -def inference_modelscope( - diar_train_config: str, - diar_model_file: str, - output_dir: Optional[str] = None, - batch_size: int = 1, - dtype: str = "float32", - ngpu: int = 0, - seed: int = 0, - num_workers: int = 0, - log_level: Union[int, str] = "INFO", - key_file: Optional[str] = None, - model_tag: Optional[str] = None, - allow_variable_data_keys: bool = True, - streaming: bool = False, - smooth_size: int = 83, - dur_threshold: int = 10, - out_format: str = "vad", - param_dict: Optional[dict] = None, - mode: str = "sond", - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.info("param_dict: {}".format(param_dict)) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2a. Build speech2xvec [Optional] - if mode == "sond_demo" and param_dict is not None and "extract_profile" in param_dict and param_dict["extract_profile"]: - assert "sv_train_config" in param_dict, "sv_train_config must be provided param_dict." - assert "sv_model_file" in param_dict, "sv_model_file must be provided in param_dict." 
- sv_train_config = param_dict["sv_train_config"] - sv_model_file = param_dict["sv_model_file"] - if "model_dir" in param_dict: - sv_train_config = os.path.join(param_dict["model_dir"], sv_train_config) - sv_model_file = os.path.join(param_dict["model_dir"], sv_model_file) - from funasr.bin.sv_inference import Speech2Xvector - speech2xvector_kwargs = dict( - sv_train_config=sv_train_config, - sv_model_file=sv_model_file, - device=device, - dtype=dtype, - streaming=streaming, - embedding_node="resnet1_dense" - ) - logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs)) - speech2xvector = Speech2Xvector.from_pretrained( - model_tag=model_tag, - **speech2xvector_kwargs, - ) - speech2xvector.sv_model.eval() - - # 2b. Build speech2diar - speech2diar_kwargs = dict( - diar_train_config=diar_train_config, - diar_model_file=diar_model_file, - device=device, - dtype=dtype, - streaming=streaming, - smooth_size=smooth_size, - dur_threshold=dur_threshold, - ) - logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs)) - speech2diar = Speech2Diarization.from_pretrained( - model_tag=model_tag, - **speech2diar_kwargs, - ) - speech2diar.diar_model.eval() - - def output_results_str(results: dict, uttid: str): - rst = [] - mid = uttid.rsplit("-", 1)[0] - for key in results: - results[key] = [(x[0]/100, x[1]/100) for x in results[key]] - if out_format == "vad": - for spk, segs in results.items(): - rst.append("{} {}".format(spk, segs)) - else: - template = "SPEAKER {} 0 {:.2f} {:.2f} {} " - for spk, segs in results.items(): - rst.extend([template.format(mid, st, ed, spk) for st, ed in segs]) - - return "\n".join(rst) - - def _forward( - data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, - raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None, - output_dir_v2: Optional[str] = None, - param_dict: Optional[dict] = None, - ): - logging.info("param_dict: {}".format(param_dict)) - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, (list, tuple)): - if not isinstance(raw_inputs[0], List): - raw_inputs = [raw_inputs] - - assert all([len(example) >= 2 for example in raw_inputs]), \ - "The length of test case in raw_inputs must larger than 1 (>=2)." - - def prepare_dataset(): - for idx, example in enumerate(raw_inputs): - # read waveform file - example = [load_bytes(x) if isinstance(x, bytes) else x - for x in example] - example = [soundfile.read(x)[0] if isinstance(x, str) else x - for x in example] - # convert torch tensor to numpy array - example = [x.numpy() if isinstance(example[0], torch.Tensor) else x - for x in example] - speech = example[0] - logging.info("Extracting profiles for {} waveforms".format(len(example)-1)) - profile = [speech2xvector.calculate_embedding(x) for x in example[1:]] - profile = torch.cat(profile, dim=0) - yield ["test{}".format(idx)], {"speech": [speech], "profile": [profile]} - - loader = prepare_dataset() - else: - raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ") - else: - # 3. Build data-iterator - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=None, - collate_fn=None, - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - # 7. 
Start for-loop - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - os.makedirs(output_path, exist_ok=True) - output_writer = open("{}/result.txt".format(output_path), "w") - pse_label_writer = open("{}/labels.txt".format(output_path), "w") - logging.info("Start to diarize...") - result_list = [] - for idx, (keys, batch) in enumerate(loader): - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - results, pse_labels = speech2diar(**batch) - # Only supporting batch_size==1 - key, value = keys[0], output_results_str(results, keys[0]) - item = {"key": key, "value": value} - result_list.append(item) - if output_path is not None: - output_writer.write(value) - output_writer.flush() - pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels))) - pse_label_writer.flush() - - if idx % 100 == 0: - logging.info("Processing {:5d}: {}".format(idx, key)) - - if output_path is not None: - output_writer.close() - pse_label_writer.close() - - return result_list - - return _forward - - -def inference( - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - diar_train_config: Optional[str], - diar_model_file: Optional[str], - output_dir: Optional[str] = None, - batch_size: int = 1, - dtype: str = "float32", - ngpu: int = 0, - seed: int = 0, - num_workers: int = 1, - log_level: Union[int, str] = "INFO", - key_file: Optional[str] = None, - model_tag: Optional[str] = None, - allow_variable_data_keys: bool = True, - streaming: bool = False, - smooth_size: int = 83, - dur_threshold: int = 10, - out_format: str = "vad", - **kwargs, -): - inference_pipeline = inference_modelscope( - diar_train_config=diar_train_config, - diar_model_file=diar_model_file, - output_dir=output_dir, - batch_size=batch_size, - dtype=dtype, - ngpu=ngpu, - seed=seed, - num_workers=num_workers, - log_level=log_level, - key_file=key_file, - model_tag=model_tag, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - smooth_size=smooth_size, - dur_threshold=dur_threshold, - out_format=out_format, - **kwargs, - ) - - return inference_pipeline(data_path_and_name_and_type, raw_inputs=None) - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Speaker verification/x-vector extraction", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--diar_train_config", - type=str, - help="diarization training configuration", - ) - group.add_argument( - "--diar_model_file", - type=str, - help="diarization model parameter file", - ) - group.add_argument( - "--dur_threshold", - type=int, - default=10, - help="The threshold for short segments in number frames" - ) - parser.add_argument( - "--smooth_size", - type=int, - default=83, - help="The smoothing window length in number frames" - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - parser.add_argument("--streaming", type=str2bool, default=False) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - logging.info("args: {}".format(kwargs)) - if args.output_dir is None: - jobid, n_gpu = 1, 1 - gpuid = args.gpuid_list.split(",")[jobid-1] - else: - jobid = int(args.output_dir.split(".")[-1]) - n_gpu = len(args.gpuid_list.split(",")) - gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu] - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - results_list = inference(**kwargs) - for results in results_list: - print("{} {}".format(results["key"], results["value"])) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/sv_infer.py b/funasr/bin/sv_infer.py new file mode 100755 index 000000000..8a9c6e9f3 --- /dev/null +++ b/funasr/bin/sv_infer.py @@ -0,0 +1,163 @@ +#!/usr/bin/env python3 +# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. 
+# MIT License (https://opensource.org/licenses/MIT) + +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +import numpy as np +import torch +from kaldiio import WriteHelper +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.utils.cli_utils import get_commandline_args +from funasr.tasks.sv import SVTask +from funasr.tasks.asr import ASRTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.utils.misc import statistic_model_parameters + +class Speech2Xvector: + """Speech2Xvector class + + Examples: + >>> import soundfile + >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2xvector(audio) + [(text, token, token_int, hypothesis object), ...] + + """ + + def __init__( + self, + sv_train_config: Union[Path, str] = None, + sv_model_file: Union[Path, str] = None, + device: str = "cpu", + batch_size: int = 1, + dtype: str = "float32", + streaming: bool = False, + embedding_node: str = "resnet1_dense", + ): + assert check_argument_types() + + # TODO: 1. Build SV model + sv_model, sv_train_args = SVTask.build_model_from_file( + config_file=sv_train_config, + model_file=sv_model_file, + device=device + ) + logging.info("sv_model: {}".format(sv_model)) + logging.info("model parameter number: {}".format(statistic_model_parameters(sv_model))) + logging.info("sv_train_args: {}".format(sv_train_args)) + sv_model.to(dtype=getattr(torch, dtype)).eval() + + self.sv_model = sv_model + self.sv_train_args = sv_train_args + self.device = device + self.dtype = dtype + self.embedding_node = embedding_node + + @torch.no_grad() + def calculate_embedding(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + + # data: (Nsamples,) -> (1, Nsamples) + speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) + # lengths: (1,) + lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) + batch = {"speech": speech, "speech_lengths": lengths} + + # a. To device + batch = to_device(batch, device=self.device) + + # b. Forward Encoder + enc, ilens = self.sv_model.encode(**batch) + + # c. Forward Pooling + pooling = self.sv_model.pooling_layer(enc) + + # d. 
Forward Decoder + outputs, embeddings = self.sv_model.decoder(pooling) + + if self.embedding_node not in embeddings: + raise ValueError("Required embedding node {} not in {}".format( + self.embedding_node, embeddings.keys())) + + return embeddings[self.embedding_node] + + @torch.no_grad() + def __call__( + self, speech: Union[torch.Tensor, np.ndarray], + ref_speech: Optional[Union[torch.Tensor, np.ndarray]] = None, + ) -> Tuple[torch.Tensor, Union[torch.Tensor, None], Union[torch.Tensor, None]]: + """Inference + + Args: + speech: Input speech data + ref_speech: Reference speech to compare + Returns: + embedding, ref_embedding, similarity_score + + """ + assert check_argument_types() + self.sv_model.eval() + embedding = self.calculate_embedding(speech) + ref_emb, score = None, None + if ref_speech is not None: + ref_emb = self.calculate_embedding(ref_speech) + score = torch.cosine_similarity(embedding, ref_emb) + + results = (embedding, ref_emb, score) + assert check_return_type(results) + return results + + @staticmethod + def from_pretrained( + model_tag: Optional[str] = None, + **kwargs: Optional[Any], + ): + """Build Speech2Xvector instance from the pretrained model. + + Args: + model_tag (Optional[str]): Model tag of the pretrained models. + Currently, the tags of espnet_model_zoo are supported. + + Returns: + Speech2Xvector: Speech2Xvector instance. + + """ + if model_tag is not None: + try: + from espnet_model_zoo.downloader import ModelDownloader + + except ImportError: + logging.error( + "`espnet_model_zoo` is not installed. " + "Please install via `pip install -U espnet_model_zoo`." + ) + raise + d = ModelDownloader() + kwargs.update(**d.download_and_unpack(model_tag)) + + return Speech2Xvector(**kwargs) + + + + diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py deleted file mode 100755 index 76b1dfbb8..000000000 --- a/funasr/bin/sv_inference.py +++ /dev/null @@ -1,443 +0,0 @@ -#!/usr/bin/env python3 -# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. -# MIT License (https://opensource.org/licenses/MIT) - -import argparse -import logging -import os -import sys -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union - -import numpy as np -import torch -from kaldiio import WriteHelper -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.utils.cli_utils import get_commandline_args -from funasr.tasks.sv import SVTask -from funasr.tasks.asr import ASRTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils.misc import statistic_model_parameters - -class Speech2Xvector: - """Speech2Xvector class - - Examples: - >>> import soundfile - >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2xvector(audio) - [(text, token, token_int, hypothesis object), ...] 
- - """ - - def __init__( - self, - sv_train_config: Union[Path, str] = None, - sv_model_file: Union[Path, str] = None, - device: str = "cpu", - batch_size: int = 1, - dtype: str = "float32", - streaming: bool = False, - embedding_node: str = "resnet1_dense", - ): - assert check_argument_types() - - # TODO: 1. Build SV model - sv_model, sv_train_args = SVTask.build_model_from_file( - config_file=sv_train_config, - model_file=sv_model_file, - device=device - ) - logging.info("sv_model: {}".format(sv_model)) - logging.info("model parameter number: {}".format(statistic_model_parameters(sv_model))) - logging.info("sv_train_args: {}".format(sv_train_args)) - sv_model.to(dtype=getattr(torch, dtype)).eval() - - self.sv_model = sv_model - self.sv_train_args = sv_train_args - self.device = device - self.dtype = dtype - self.embedding_node = embedding_node - - @torch.no_grad() - def calculate_embedding(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor: - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - # data: (Nsamples,) -> (1, Nsamples) - speech = speech.unsqueeze(0).to(getattr(torch, self.dtype)) - # lengths: (1,) - lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1)) - batch = {"speech": speech, "speech_lengths": lengths} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - enc, ilens = self.sv_model.encode(**batch) - - # c. Forward Pooling - pooling = self.sv_model.pooling_layer(enc) - - # d. Forward Decoder - outputs, embeddings = self.sv_model.decoder(pooling) - - if self.embedding_node not in embeddings: - raise ValueError("Required embedding node {} not in {}".format( - self.embedding_node, embeddings.keys())) - - return embeddings[self.embedding_node] - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], - ref_speech: Optional[Union[torch.Tensor, np.ndarray]] = None, - ) -> Tuple[torch.Tensor, Union[torch.Tensor, None], Union[torch.Tensor, None]]: - """Inference - - Args: - speech: Input speech data - ref_speech: Reference speech to compare - Returns: - embedding, ref_embedding, similarity_score - - """ - assert check_argument_types() - self.sv_model.eval() - embedding = self.calculate_embedding(speech) - ref_emb, score = None, None - if ref_speech is not None: - ref_emb = self.calculate_embedding(ref_speech) - score = torch.cosine_similarity(embedding, ref_emb) - - results = (embedding, ref_emb, score) - assert check_return_type(results) - return results - - @staticmethod - def from_pretrained( - model_tag: Optional[str] = None, - **kwargs: Optional[Any], - ): - """Build Speech2Xvector instance from the pretrained model. - - Args: - model_tag (Optional[str]): Model tag of the pretrained models. - Currently, the tags of espnet_model_zoo are supported. - - Returns: - Speech2Xvector: Speech2Xvector instance. - - """ - if model_tag is not None: - try: - from espnet_model_zoo.downloader import ModelDownloader - - except ImportError: - logging.error( - "`espnet_model_zoo` is not installed. " - "Please install via `pip install -U espnet_model_zoo`." 
- ) - raise - d = ModelDownloader() - kwargs.update(**d.download_and_unpack(model_tag)) - - return Speech2Xvector(**kwargs) - - -def inference_modelscope( - output_dir: Optional[str] = None, - batch_size: int = 1, - dtype: str = "float32", - ngpu: int = 1, - seed: int = 0, - num_workers: int = 0, - log_level: Union[int, str] = "INFO", - key_file: Optional[str] = None, - sv_train_config: Optional[str] = "sv.yaml", - sv_model_file: Optional[str] = "sv.pb", - model_tag: Optional[str] = None, - allow_variable_data_keys: bool = True, - streaming: bool = False, - embedding_node: str = "resnet1_dense", - sv_threshold: float = 0.9465, - param_dict: Optional[dict] = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - logging.info("param_dict: {}".format(param_dict)) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2xvector - speech2xvector_kwargs = dict( - sv_train_config=sv_train_config, - sv_model_file=sv_model_file, - device=device, - dtype=dtype, - streaming=streaming, - embedding_node=embedding_node - ) - logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs)) - speech2xvector = Speech2Xvector.from_pretrained( - model_tag=model_tag, - **speech2xvector_kwargs, - ) - speech2xvector.sv_model.eval() - - def _forward( - data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - param_dict: Optional[dict] = None, - ): - logging.info("param_dict: {}".format(param_dict)) - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - - # 3. 
Build data-iterator - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=None, - collate_fn=None, - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - # 7 .Start for-loop - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - embd_writer, ref_embd_writer, score_writer = None, None, None - if output_path is not None: - os.makedirs(output_path, exist_ok=True) - embd_writer = WriteHelper("ark,scp:{}/xvector.ark,{}/xvector.scp".format(output_path, output_path)) - sv_result_list = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} - - embedding, ref_embedding, score = speech2xvector(**batch) - # Only supporting batch_size==1 - key = keys[0] - normalized_score = 0.0 - if score is not None: - score = score.item() - normalized_score = max(score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0 - item = {"key": key, "value": normalized_score} - else: - item = {"key": key, "value": embedding.squeeze(0).cpu().numpy()} - sv_result_list.append(item) - if output_path is not None: - embd_writer(key, embedding[0].cpu().numpy()) - if ref_embedding is not None: - if ref_embd_writer is None: - ref_embd_writer = WriteHelper( - "ark,scp:{}/ref_xvector.ark,{}/ref_xvector.scp".format(output_path, output_path) - ) - score_writer = open(os.path.join(output_path, "score.txt"), "w") - ref_embd_writer(key, ref_embedding[0].cpu().numpy()) - score_writer.write("{} {:.6f}\n".format(key, normalized_score)) - - if output_path is not None: - embd_writer.close() - if ref_embd_writer is not None: - ref_embd_writer.close() - score_writer.close() - - return sv_result_list - - return _forward - - -def inference( - output_dir: Optional[str], - batch_size: int, - dtype: str, - ngpu: int, - seed: int, - num_workers: int, - log_level: Union[int, str], - data_path_and_name_and_type: Sequence[Tuple[str, str, str]], - key_file: Optional[str], - sv_train_config: Optional[str], - sv_model_file: Optional[str], - model_tag: Optional[str], - allow_variable_data_keys: bool = True, - streaming: bool = False, - embedding_node: str = "resnet1_dense", - sv_threshold: float = 0.9465, - **kwargs, -): - inference_pipeline = inference_modelscope( - output_dir=output_dir, - batch_size=batch_size, - dtype=dtype, - ngpu=ngpu, - seed=seed, - num_workers=num_workers, - log_level=log_level, - key_file=key_file, - sv_train_config=sv_train_config, - sv_model_file=sv_model_file, - model_tag=model_tag, - allow_variable_data_keys=allow_variable_data_keys, - streaming=streaming, - embedding_node=embedding_node, - sv_threshold=sv_threshold, - **kwargs, - ) - - return inference_pipeline(data_path_and_name_and_type, raw_inputs=None) - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Speaker verification/x-vector extraction", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. 
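The per-utterance value written to score.txt above is the raw cosine similarity rescaled against sv_threshold. A small self-contained sketch of that mapping, with made-up similarity values for illustration:

def normalize_score(cosine_score: float, sv_threshold: float = 0.9465) -> float:
    # Mirrors _forward above: similarities at or below sv_threshold map to 0,
    # and the remaining [sv_threshold, 1.0] range is stretched to [0, 100].
    return max(cosine_score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0

for s in (0.90, 0.9465, 0.97, 1.0):        # hypothetical cosine similarities
    print("{:.4f} -> {:.2f}".format(s, normalize_score(s)))
# 0.9000 -> 0.00, 0.9465 -> 0.00, 0.9700 -> 43.93, 1.0000 -> 100.00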
- parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--sv_train_config", - type=str, - help="SV training configuration", - ) - group.add_argument( - "--sv_model_file", - type=str, - help="SV model parameter file", - ) - group.add_argument( - "--sv_threshold", - type=float, - default=0.9465, - help="The threshold for verification" - ) - group.add_argument( - "--model_tag", - type=str, - help="Pretrained model tag. If specify this option, *_train_config and " - "*_file will be overwritten", - ) - parser.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - parser.add_argument("--streaming", type=str2bool, default=False) - parser.add_argument("--embedding_node", type=str, default="resnet1_dense") - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - logging.info("args: {}".format(kwargs)) - if args.output_dir is None: - jobid, n_gpu = 1, 1 - gpuid = args.gpuid_list.split(",")[jobid-1] - else: - jobid = int(args.output_dir.split(".")[-1]) - n_gpu = len(args.gpuid_list.split(",")) - gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu] - os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" - os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - results_list = inference(**kwargs) - for results in results_list: - print("{} {}".format(results["key"], results["value"])) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/sv_inference_launch.py b/funasr/bin/sv_inference_launch.py index 880607013..24b86386f 100755 --- a/funasr/bin/sv_inference_launch.py +++ b/funasr/bin/sv_inference_launch.py @@ -14,6 +14,164 @@ from funasr.utils.cli_utils import get_commandline_args from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none +import argparse +import logging +import os +import sys +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union + +import numpy as np +import torch +from kaldiio import WriteHelper +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.utils.cli_utils import get_commandline_args +from funasr.tasks.sv import 
SVTask +from funasr.tasks.asr import ASRTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.utils.misc import statistic_model_parameters +from funasr.bin.sv_infer import Speech2Xvector + +def inference_sv( + output_dir: Optional[str] = None, + batch_size: int = 1, + dtype: str = "float32", + ngpu: int = 1, + seed: int = 0, + num_workers: int = 0, + log_level: Union[int, str] = "INFO", + key_file: Optional[str] = None, + sv_train_config: Optional[str] = "sv.yaml", + sv_model_file: Optional[str] = "sv.pb", + model_tag: Optional[str] = None, + allow_variable_data_keys: bool = True, + streaming: bool = False, + embedding_node: str = "resnet1_dense", + sv_threshold: float = 0.9465, + param_dict: Optional[dict] = None, + **kwargs, +): + assert check_argument_types() + ncpu = kwargs.get("ncpu", 1) + torch.set_num_threads(ncpu) + + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + logging.info("param_dict: {}".format(param_dict)) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. Set random-seed + set_all_random_seed(seed) + + # 2. Build speech2xvector + speech2xvector_kwargs = dict( + sv_train_config=sv_train_config, + sv_model_file=sv_model_file, + device=device, + dtype=dtype, + streaming=streaming, + embedding_node=embedding_node + ) + logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs)) + speech2xvector = Speech2Xvector.from_pretrained( + model_tag=model_tag, + **speech2xvector_kwargs, + ) + speech2xvector.sv_model.eval() + + def _forward( + data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + output_dir_v2: Optional[str] = None, + param_dict: Optional[dict] = None, + ): + logging.info("param_dict: {}".format(param_dict)) + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, torch.Tensor): + raw_inputs = raw_inputs.numpy() + data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + + # 3. 
Build data-iterator + loader = ASRTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=None, + collate_fn=None, + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + # 7 .Start for-loop + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + embd_writer, ref_embd_writer, score_writer = None, None, None + if output_path is not None: + os.makedirs(output_path, exist_ok=True) + embd_writer = WriteHelper("ark,scp:{}/xvector.ark,{}/xvector.scp".format(output_path, output_path)) + sv_result_list = [] + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} + + embedding, ref_embedding, score = speech2xvector(**batch) + # Only supporting batch_size==1 + key = keys[0] + normalized_score = 0.0 + if score is not None: + score = score.item() + normalized_score = max(score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0 + item = {"key": key, "value": normalized_score} + else: + item = {"key": key, "value": embedding.squeeze(0).cpu().numpy()} + sv_result_list.append(item) + if output_path is not None: + embd_writer(key, embedding[0].cpu().numpy()) + if ref_embedding is not None: + if ref_embd_writer is None: + ref_embd_writer = WriteHelper( + "ark,scp:{}/ref_xvector.ark,{}/ref_xvector.scp".format(output_path, output_path) + ) + score_writer = open(os.path.join(output_path, "score.txt"), "w") + ref_embd_writer(key, ref_embedding[0].cpu().numpy()) + score_writer.write("{} {:.6f}\n".format(key, normalized_score)) + + if output_path is not None: + embd_writer.close() + if ref_embd_writer is not None: + ref_embd_writer.close() + score_writer.close() + + return sv_result_list + + return _forward def get_parser(): @@ -133,8 +291,7 @@ def get_parser(): def inference_launch(mode, **kwargs): if mode == "sv": - from funasr.bin.sv_inference import inference_modelscope - return inference_modelscope(**kwargs) + return inference_sv(**kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None @@ -167,7 +324,8 @@ def main(cmd=None): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - inference_launch(**kwargs) + inference_pipeline = inference_launch(**kwargs) + return inference_pipeline(kwargs["data_path_and_name_and_type"]) if __name__ == "__main__": diff --git a/funasr/bin/tp_infer.py b/funasr/bin/tp_infer.py new file mode 100644 index 000000000..c83ceeaa4 --- /dev/null +++ b/funasr/bin/tp_infer.py @@ -0,0 +1,115 @@ +import argparse +import logging +from optparse import Option +import sys +import json +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Dict + +import numpy as np +import torch +from typeguard import check_argument_types + +from funasr.fileio.datadir_writer import DatadirWriter +from funasr.datasets.preprocessor import LMPreprocessor +from funasr.tasks.asr import ASRTaskAligner as ASRTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.cli_utils import 
get_commandline_args +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.models.frontend.wav_frontend import WavFrontend +from funasr.text.token_id_converter import TokenIDConverter +from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard + + + + +class Speech2Timestamp: + def __init__( + self, + timestamp_infer_config: Union[Path, str] = None, + timestamp_model_file: Union[Path, str] = None, + timestamp_cmvn_file: Union[Path, str] = None, + device: str = "cpu", + dtype: str = "float32", + **kwargs, + ): + assert check_argument_types() + # 1. Build ASR model + tp_model, tp_train_args = ASRTask.build_model_from_file( + timestamp_infer_config, timestamp_model_file, device=device + ) + if 'cuda' in device: + tp_model = tp_model.cuda() # force model to cuda + + frontend = None + if tp_train_args.frontend is not None: + frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf) + + logging.info("tp_model: {}".format(tp_model)) + logging.info("tp_train_args: {}".format(tp_train_args)) + tp_model.to(dtype=getattr(torch, dtype)).eval() + + logging.info(f"Decoding device={device}, dtype={dtype}") + + + self.tp_model = tp_model + self.tp_train_args = tp_train_args + + token_list = self.tp_model.token_list + self.converter = TokenIDConverter(token_list=token_list) + + self.device = device + self.dtype = dtype + self.frontend = frontend + self.encoder_downsampling_factor = 1 + if tp_train_args.encoder_conf["input_layer"] == "conv2d": + self.encoder_downsampling_factor = 4 + + @torch.no_grad() + def __call__( + self, + speech: Union[torch.Tensor, np.ndarray], + speech_lengths: Union[torch.Tensor, np.ndarray] = None, + text_lengths: Union[torch.Tensor, np.ndarray] = None + ): + assert check_argument_types() + + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + if self.frontend is not None: + feats, feats_len = self.frontend.forward(speech, speech_lengths) + feats = to_device(feats, device=self.device) + feats_len = feats_len.int() + self.tp_model.frontend = None + else: + feats = speech + feats_len = speech_lengths + + # lfr_factor = max(1, (feats.size()[-1]//80)-1) + batch = {"speech": feats, "speech_lengths": feats_len} + + # a. To device + batch = to_device(batch, device=self.device) + + # b. Forward Encoder + enc, enc_len = self.tp_model.encode(**batch) + if isinstance(enc, tuple): + enc = enc[0] + + # c. 
Forward Predictor + _, _, us_alphas, us_peaks = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1) + return us_alphas, us_peaks + + + diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py deleted file mode 100644 index 6e513c5a0..000000000 --- a/funasr/bin/tp_inference.py +++ /dev/null @@ -1,399 +0,0 @@ -import argparse -import logging -from optparse import Option -import sys -import json -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.datasets.preprocessor import LMPreprocessor -from funasr.tasks.asr import ASRTaskAligner as ASRTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.models.frontend.wav_frontend import WavFrontend -from funasr.text.token_id_converter import TokenIDConverter -from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard - - -header_colors = '\033[95m' -end_colors = '\033[0m' - -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} - - -class SpeechText2Timestamp: - def __init__( - self, - timestamp_infer_config: Union[Path, str] = None, - timestamp_model_file: Union[Path, str] = None, - timestamp_cmvn_file: Union[Path, str] = None, - device: str = "cpu", - dtype: str = "float32", - **kwargs, - ): - assert check_argument_types() - # 1. 
Build ASR model - tp_model, tp_train_args = ASRTask.build_model_from_file( - timestamp_infer_config, timestamp_model_file, device=device - ) - if 'cuda' in device: - tp_model = tp_model.cuda() # force model to cuda - - frontend = None - if tp_train_args.frontend is not None: - frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf) - - logging.info("tp_model: {}".format(tp_model)) - logging.info("tp_train_args: {}".format(tp_train_args)) - tp_model.to(dtype=getattr(torch, dtype)).eval() - - logging.info(f"Decoding device={device}, dtype={dtype}") - - - self.tp_model = tp_model - self.tp_train_args = tp_train_args - - token_list = self.tp_model.token_list - self.converter = TokenIDConverter(token_list=token_list) - - self.device = device - self.dtype = dtype - self.frontend = frontend - self.encoder_downsampling_factor = 1 - if tp_train_args.encoder_conf["input_layer"] == "conv2d": - self.encoder_downsampling_factor = 4 - - @torch.no_grad() - def __call__( - self, - speech: Union[torch.Tensor, np.ndarray], - speech_lengths: Union[torch.Tensor, np.ndarray] = None, - text_lengths: Union[torch.Tensor, np.ndarray] = None - ): - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - self.tp_model.frontend = None - else: - feats = speech - feats_len = speech_lengths - - # lfr_factor = max(1, (feats.size()[-1]//80)-1) - batch = {"speech": feats, "speech_lengths": feats_len} - - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - enc, enc_len = self.tp_model.encode(**batch) - if isinstance(enc, tuple): - enc = enc[0] - - # c. 
Forward Predictor - _, _, us_alphas, us_peaks = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1) - return us_alphas, us_peaks - - -def inference( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - data_path_and_name_and_type, - timestamp_infer_config: Optional[str], - timestamp_model_file: Optional[str], - timestamp_cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - split_with_space: bool = True, - seg_dict_file: Optional[str] = None, - **kwargs, -): - inference_pipeline = inference_modelscope( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - timestamp_infer_config=timestamp_infer_config, - timestamp_model_file=timestamp_model_file, - timestamp_cmvn_file=timestamp_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - split_with_space=split_with_space, - seg_dict_file=seg_dict_file, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - timestamp_infer_config: Optional[str], - timestamp_model_file: Optional[str], - timestamp_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - split_with_space: bool = True, - seg_dict_file: Optional[str] = None, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. 
Build speech2vadsegment - speechtext2timestamp_kwargs = dict( - timestamp_infer_config=timestamp_infer_config, - timestamp_model_file=timestamp_model_file, - timestamp_cmvn_file=timestamp_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs)) - speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs) - - preprocessor = LMPreprocessor( - train=False, - token_type=speechtext2timestamp.tp_train_args.token_type, - token_list=speechtext2timestamp.tp_train_args.token_list, - bpemodel=None, - text_cleaner=None, - g2p_type=None, - text_name="text", - non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols, - split_with_space=split_with_space, - seg_dict_file=seg_dict_file, - ) - - if output_dir is not None: - writer = DatadirWriter(output_dir) - tp_writer = writer[f"timestamp_prediction"] - # ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) - else: - tp_writer = None - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - **kwargs - ): - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - writer = None - if output_path is not None: - writer = DatadirWriter(output_path) - tp_writer = writer[f"timestamp_prediction"] - else: - tp_writer = None - # 3. Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - - loader = ASRTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=preprocessor, - collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - tp_result_list = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - - logging.info("timestamp predicting, utt_id: {}".format(keys)) - _batch = {'speech':batch['speech'], - 'speech_lengths':batch['speech_lengths'], - 'text_lengths':batch['text_lengths']} - us_alphas, us_cif_peak = speechtext2timestamp(**_batch) - - for batch_id in range(_bs): - key = keys[batch_id] - token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id]) - ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token, force_time_shift=-3.0) - logging.warning(ts_str) - item = {'key': key, 'value': ts_str, 'timestamp':ts_list} - if tp_writer is not None: - tp_writer["tp_sync"][key+'#'] = ts_str - tp_writer["tp_time"][key+'#'] = str(ts_list) - tp_result_list.append(item) - return tp_result_list - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="Timestamp Prediction Inference", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. 
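A hedged usage sketch of the Speech2Timestamp class introduced in funasr/bin/tp_infer.py above; the config/model/cmvn paths, the wav file, and the token list are placeholders, and the production path instead builds batches through LMPreprocessor and the streaming iterator as in the surrounding code:

import soundfile
import torch

from funasr.bin.tp_infer import Speech2Timestamp
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard

s2tp = Speech2Timestamp(
    timestamp_infer_config="tp_config.yaml",   # placeholder path
    timestamp_model_file="tp_model.pb",        # placeholder path
    timestamp_cmvn_file="tp_cmvn.ark",         # placeholder path
    device="cpu",
)

speech, _ = soundfile.read("speech.wav")                         # placeholder utterance
speech = torch.tensor(speech, dtype=torch.float32).unsqueeze(0)  # (1, Nsamples)
speech_lengths = torch.tensor([speech.shape[1]], dtype=torch.int32)
tokens = ["hello", "world"]                                      # placeholder transcript tokens
text_lengths = torch.tensor([len(tokens)], dtype=torch.int32)

us_alphas, us_peaks = s2tp(speech, speech_lengths, text_lengths)
ts_str, ts_list = ts_prediction_lfr6_standard(
    us_alphas[0], us_peaks[0], tokens, force_time_shift=-3.0
)
print(ts_str)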
- parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=0, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--timestamp_infer_config", - type=str, - help="VAD infer configuration", - ) - group.add_argument( - "--timestamp_model_file", - type=str, - help="VAD model parameter file", - ) - group.add_argument( - "--timestamp_cmvn_file", - type=str, - help="Global cmvn file", - ) - - group = parser.add_argument_group("infer related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - group.add_argument( - "--seg_dict_file", - type=str, - default=None, - help="The batch size for inference", - ) - group.add_argument( - "--split_with_space", - type=bool, - default=False, - help="The batch size for inference", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py index 6cdff057d..2b2b2aebf 100644 --- a/funasr/bin/tp_inference_launch.py +++ b/funasr/bin/tp_inference_launch.py @@ -13,6 +13,171 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none +import argparse +import logging +from optparse import Option +import sys +import json +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Dict + +import numpy as np +import torch +from typeguard import check_argument_types + +from funasr.fileio.datadir_writer import DatadirWriter +from funasr.datasets.preprocessor import LMPreprocessor +from funasr.tasks.asr import ASRTaskAligner as ASRTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.cli_utils import get_commandline_args +from funasr.utils.types import str2bool +from funasr.utils.types import 
str2triple_str +from funasr.utils.types import str_or_none +from funasr.models.frontend.wav_frontend import WavFrontend +from funasr.text.token_id_converter import TokenIDConverter +from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard +from funasr.bin.tp_infer import Speech2Timestamp + +def inference_tp( + batch_size: int, + ngpu: int, + log_level: Union[int, str], + # data_path_and_name_and_type, + timestamp_infer_config: Optional[str], + timestamp_model_file: Optional[str], + timestamp_cmvn_file: Optional[str] = None, + # raw_inputs: Union[np.ndarray, torch.Tensor] = None, + key_file: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + num_workers: int = 1, + split_with_space: bool = True, + seg_dict_file: Optional[str] = None, + **kwargs, +): + assert check_argument_types() + ncpu = kwargs.get("ncpu", 1) + torch.set_num_threads(ncpu) + + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + # 1. Set random-seed + set_all_random_seed(seed) + + # 2. Build speech2vadsegment + speechtext2timestamp_kwargs = dict( + timestamp_infer_config=timestamp_infer_config, + timestamp_model_file=timestamp_model_file, + timestamp_cmvn_file=timestamp_cmvn_file, + device=device, + dtype=dtype, + ) + logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs)) + speechtext2timestamp = Speech2Timestamp(**speechtext2timestamp_kwargs) + + preprocessor = LMPreprocessor( + train=False, + token_type=speechtext2timestamp.tp_train_args.token_type, + token_list=speechtext2timestamp.tp_train_args.token_list, + bpemodel=None, + text_cleaner=None, + g2p_type=None, + text_name="text", + non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols, + split_with_space=split_with_space, + seg_dict_file=seg_dict_file, + ) + + if output_dir is not None: + writer = DatadirWriter(output_dir) + tp_writer = writer[f"timestamp_prediction"] + # ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) + else: + tp_writer = None + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + output_dir_v2: Optional[str] = None, + fs: dict = None, + param_dict: dict = None, + **kwargs + ): + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + writer = None + if output_path is not None: + writer = DatadirWriter(output_path) + tp_writer = writer[f"timestamp_prediction"] + else: + tp_writer = None + # 3. 
Build data-iterator + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, torch.Tensor): + raw_inputs = raw_inputs.numpy() + data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + + loader = ASRTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=preprocessor, + collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False), + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + tp_result_list = [] + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + + logging.info("timestamp predicting, utt_id: {}".format(keys)) + _batch = {'speech': batch['speech'], + 'speech_lengths': batch['speech_lengths'], + 'text_lengths': batch['text_lengths']} + us_alphas, us_cif_peak = speechtext2timestamp(**_batch) + + for batch_id in range(_bs): + key = keys[batch_id] + token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id]) + ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token, + force_time_shift=-3.0) + logging.warning(ts_str) + item = {'key': key, 'value': ts_str, 'timestamp': ts_list} + if tp_writer is not None: + tp_writer["tp_sync"][key + '#'] = ts_str + tp_writer["tp_time"][key + '#'] = str(ts_list) + tp_result_list.append(item) + return tp_result_list + + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( @@ -102,8 +267,7 @@ def get_parser(): def inference_launch(mode, **kwargs): if mode == "tp_norm": - from funasr.bin.tp_inference import inference_modelscope - return inference_modelscope(**kwargs) + return inference_tp(**kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None @@ -135,7 +299,9 @@ def main(cmd=None): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - inference_launch(**kwargs) + inference_pipeline = inference_launch(**kwargs) + return inference_pipeline(kwargs["data_path_and_name_and_type"]) + if __name__ == "__main__": diff --git a/funasr/bin/vad_infer.py b/funasr/bin/vad_infer.py new file mode 100644 index 000000000..5835e77df --- /dev/null +++ b/funasr/bin/vad_infer.py @@ -0,0 +1,196 @@ +import argparse +import logging +import os +import sys +import json +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Dict + +import math +import numpy as np +import torch +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.fileio.datadir_writer import DatadirWriter +from funasr.modules.scorers.scorer_interface import BatchScorerInterface +from funasr.modules.subsampling import TooShortUttError +from funasr.tasks.vad import VADTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.cli_utils import get_commandline_args +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.utils import asr_utils, wav_utils, postprocess_utils +from 
funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline + + + +class Speech2VadSegment: + """Speech2VadSegment class + + Examples: + >>> import soundfile + >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2segment(audio) + [[10, 230], [245, 450], ...] + + """ + + def __init__( + self, + vad_infer_config: Union[Path, str] = None, + vad_model_file: Union[Path, str] = None, + vad_cmvn_file: Union[Path, str] = None, + device: str = "cpu", + batch_size: int = 1, + dtype: str = "float32", + **kwargs, + ): + assert check_argument_types() + + # 1. Build vad model + vad_model, vad_infer_args = VADTask.build_model_from_file( + vad_infer_config, vad_model_file, device + ) + frontend = None + if vad_infer_args.frontend is not None: + frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf) + + logging.info("vad_model: {}".format(vad_model)) + logging.info("vad_infer_args: {}".format(vad_infer_args)) + vad_model.to(dtype=getattr(torch, dtype)).eval() + + self.vad_model = vad_model + self.vad_infer_args = vad_infer_args + self.device = device + self.dtype = dtype + self.frontend = frontend + self.batch_size = batch_size + + @torch.no_grad() + def __call__( + self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, + in_cache: Dict[str, torch.Tensor] = dict() + ) -> Tuple[List[List[int]], Dict[str, torch.Tensor]]: + """Inference + + Args: + speech: Input speech data + Returns: + text, token, token_int, hyp + + """ + assert check_argument_types() + + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + + if self.frontend is not None: + self.frontend.filter_length_max = math.inf + fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths) + feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len) + fbanks = to_device(fbanks, device=self.device) + feats = to_device(feats, device=self.device) + feats_len = feats_len.int() + else: + raise Exception("Need to extract feats first, please configure frontend configuration") + + # b. Forward Encoder streaming + t_offset = 0 + step = min(feats_len.max(), 6000) + segments = [[]] * self.batch_size + for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): + if t_offset + step >= feats_len - 1: + step = feats_len - t_offset + is_final = True + else: + is_final = False + batch = { + "feats": feats[:, t_offset:t_offset + step, :], + "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], + "is_final": is_final, + "in_cache": in_cache + } + # a. To device + #batch = to_device(batch, device=self.device) + segments_part, in_cache = self.vad_model(**batch) + if segments_part: + for batch_num in range(0, self.batch_size): + segments[batch_num] += segments_part[batch_num] + return fbanks, segments + +class Speech2VadSegmentOnline(Speech2VadSegment): + """Speech2VadSegmentOnline class + + Examples: + >>> import soundfile + >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2segment(audio) + [[10, 230], [245, 450], ...] 
+ + """ + def __init__(self, **kwargs): + super(Speech2VadSegmentOnline, self).__init__(**kwargs) + vad_cmvn_file = kwargs.get('vad_cmvn_file', None) + self.frontend = None + if self.vad_infer_args.frontend is not None: + self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf) + + + @torch.no_grad() + def __call__( + self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, + in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800 + ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]: + """Inference + + Args: + speech: Input speech data + Returns: + text, token, token_int, hyp + + """ + assert check_argument_types() + + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + batch_size = speech.shape[0] + segments = [[]] * batch_size + if self.frontend is not None: + feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final) + fbanks, _ = self.frontend.get_fbank() + else: + raise Exception("Need to extract feats first, please configure frontend configuration") + if feats.shape[0]: + feats = to_device(feats, device=self.device) + feats_len = feats_len.int() + waveforms = self.frontend.get_waveforms() + + batch = { + "feats": feats, + "waveform": waveforms, + "in_cache": in_cache, + "is_final": is_final, + "max_end_sil": max_end_sil + } + # a. To device + batch = to_device(batch, device=self.device) + segments, in_cache = self.vad_model.forward_online(**batch) + # in_cache.update(batch['in_cache']) + # in_cache = {key: value for key, value in batch['in_cache'].items()} + return fbanks, segments, in_cache + + diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py deleted file mode 100644 index 5fbd8449a..000000000 --- a/funasr/bin/vad_inference.py +++ /dev/null @@ -1,570 +0,0 @@ -import argparse -import logging -import os -import sys -import json -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import math -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.scorers.scorer_interface import BatchScorerInterface -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.vad import VADTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline - -header_colors = '\033[95m' -end_colors = '\033[0m' - -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} - - -class Speech2VadSegment: - """Speech2VadSegment class - - Examples: - >>> import soundfile - >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2segment(audio) - [[10, 230], [245, 450], ...] 
- - """ - - def __init__( - self, - vad_infer_config: Union[Path, str] = None, - vad_model_file: Union[Path, str] = None, - vad_cmvn_file: Union[Path, str] = None, - device: str = "cpu", - batch_size: int = 1, - dtype: str = "float32", - **kwargs, - ): - assert check_argument_types() - - # 1. Build vad model - vad_model, vad_infer_args = VADTask.build_model_from_file( - vad_infer_config, vad_model_file, device - ) - frontend = None - if vad_infer_args.frontend is not None: - frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf) - - logging.info("vad_model: {}".format(vad_model)) - logging.info("vad_infer_args: {}".format(vad_infer_args)) - vad_model.to(dtype=getattr(torch, dtype)).eval() - - self.vad_model = vad_model - self.vad_infer_args = vad_infer_args - self.device = device - self.dtype = dtype - self.frontend = frontend - self.batch_size = batch_size - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, - in_cache: Dict[str, torch.Tensor] = dict() - ) -> Tuple[List[List[int]], Dict[str, torch.Tensor]]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - self.frontend.filter_length_max = math.inf - fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths) - feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len) - fbanks = to_device(fbanks, device=self.device) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - else: - raise Exception("Need to extract feats first, please configure frontend configuration") - - # b. Forward Encoder streaming - t_offset = 0 - step = min(feats_len.max(), 6000) - segments = [[]] * self.batch_size - for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): - if t_offset + step >= feats_len - 1: - step = feats_len - t_offset - is_final = True - else: - is_final = False - batch = { - "feats": feats[:, t_offset:t_offset + step, :], - "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], - "is_final": is_final, - "in_cache": in_cache - } - # a. To device - #batch = to_device(batch, device=self.device) - segments_part, in_cache = self.vad_model(**batch) - if segments_part: - for batch_num in range(0, self.batch_size): - segments[batch_num] += segments_part[batch_num] - return fbanks, segments - -class Speech2VadSegmentOnline(Speech2VadSegment): - """Speech2VadSegmentOnline class - - Examples: - >>> import soundfile - >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2segment(audio) - [[10, 230], [245, 450], ...] 
- - """ - def __init__(self, **kwargs): - super(Speech2VadSegmentOnline, self).__init__(**kwargs) - vad_cmvn_file = kwargs.get('vad_cmvn_file', None) - self.frontend = None - if self.vad_infer_args.frontend is not None: - self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf) - - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, - in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800 - ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - batch_size = speech.shape[0] - segments = [[]] * batch_size - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final) - fbanks, _ = self.frontend.get_fbank() - else: - raise Exception("Need to extract feats first, please configure frontend configuration") - if feats.shape[0]: - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - waveforms = self.frontend.get_waveforms() - - batch = { - "feats": feats, - "waveform": waveforms, - "in_cache": in_cache, - "is_final": is_final, - "max_end_sil": max_end_sil - } - # a. To device - batch = to_device(batch, device=self.device) - segments, in_cache = self.vad_model.forward_online(**batch) - # in_cache.update(batch['in_cache']) - # in_cache = {key: value for key, value in batch['in_cache'].items()} - return fbanks, segments, in_cache - - -def inference( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - online: bool = False, - **kwargs, -): - if not online: - inference_pipeline = inference_modelscope( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - **kwargs, - ) - else: - inference_pipeline = inference_modelscope_online( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - -def inference_modelscope( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - assert 
check_argument_types() - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - batch_size = 1 - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None - ): - # 3. Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = VADTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - else: - writer = None - ibest_writer = None - - vad_results = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - - # do vad segment - _, results = speech2vadsegment(**batch) - for i, _ in enumerate(keys): - if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": - results[i] = json.dumps(results[i]) - item = {'key': keys[i], 'value': results[i]} - vad_results.append(item) - if writer is not None: - ibest_writer["text"][keys[i]] = "{}".format(results[i]) - - return vad_results - - return _forward - -def inference_modelscope_online( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - assert check_argument_types() - - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - batch_size = 1 - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. 
Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - ): - # 3. Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = VADTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - else: - writer = None - ibest_writer = None - - vad_results = [] - batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict() - is_final = param_dict.get('is_final', False) if param_dict is not None else False - max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800 - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - batch['in_cache'] = batch_in_cache - batch['is_final'] = is_final - batch['max_end_sil'] = max_end_sil - - # do vad segment - _, results, param_dict['in_cache'] = speech2vadsegment(**batch) - # param_dict['in_cache'] = batch['in_cache'] - if results: - for i, _ in enumerate(keys): - if results[i]: - if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": - results[i] = json.dumps(results[i]) - item = {'key': keys[i], 'value': results[i]} - vad_results.append(item) - if writer is not None: - ibest_writer["text"][keys[i]] = "{}".format(results[i]) - - return vad_results - - return _forward - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="VAD Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--vad_infer_config", - type=str, - help="VAD infer configuration", - ) - group.add_argument( - "--vad_model_file", - type=str, - help="VAD model parameter file", - ) - group.add_argument( - "--vad_cmvn_file", - type=str, - help="Global cmvn file", - ) - group.add_argument( - "--online", - type=str, - help="decoding mode", - ) - - group = parser.add_argument_group("infer related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() - diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py index de589259f..2ccc71691 100644 --- a/funasr/bin/vad_inference_launch.py +++ b/funasr/bin/vad_inference_launch.py @@ -17,6 +17,255 @@ from funasr.utils.types import str2bool from funasr.utils.types import str2triple_str from funasr.utils.types import str_or_none +import argparse +import logging +import os +import sys +import json +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Dict + +import math +import numpy as np +import torch +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.fileio.datadir_writer import DatadirWriter +from funasr.modules.scorers.scorer_interface import BatchScorerInterface +from funasr.modules.subsampling import TooShortUttError +from funasr.tasks.vad import VADTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.cli_utils import get_commandline_args +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.utils import asr_utils, wav_utils, postprocess_utils +from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline +from funasr.bin.vad_infer import Speech2VadSegment, Speech2VadSegmentOnline + +def inference_vad( + batch_size: int, + ngpu: int, + log_level: Union[int, str], + # data_path_and_name_and_type, + vad_infer_config: Optional[str], + 
vad_model_file: Optional[str], + vad_cmvn_file: Optional[str] = None, + # raw_inputs: Union[np.ndarray, torch.Tensor] = None, + key_file: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + num_workers: int = 1, + **kwargs, +): + assert check_argument_types() + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + batch_size = 1 + # 1. Set random-seed + set_all_random_seed(seed) + + # 2. Build speech2vadsegment + speech2vadsegment_kwargs = dict( + vad_infer_config=vad_infer_config, + vad_model_file=vad_model_file, + vad_cmvn_file=vad_cmvn_file, + device=device, + dtype=dtype, + ) + logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) + speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + output_dir_v2: Optional[str] = None, + fs: dict = None, + param_dict: dict = None + ): + # 3. Build data-iterator + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, torch.Tensor): + raw_inputs = raw_inputs.numpy() + data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + loader = VADTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), + collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + finish_count = 0 + file_count = 1 + # 7 .Start for-loop + # FIXME(kamo): The output format should be discussed about + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path is not None: + writer = DatadirWriter(output_path) + ibest_writer = writer[f"1best_recog"] + else: + writer = None + ibest_writer = None + + vad_results = [] + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + + # do vad segment + _, results = speech2vadsegment(**batch) + for i, _ in enumerate(keys): + if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": + results[i] = json.dumps(results[i]) + item = {'key': keys[i], 'value': results[i]} + vad_results.append(item) + if writer is not None: + ibest_writer["text"][keys[i]] = "{}".format(results[i]) + + return vad_results + + return _forward + +def inference_vad_online( + batch_size: int, + ngpu: int, + log_level: Union[int, str], + # data_path_and_name_and_type, + vad_infer_config: Optional[str], + vad_model_file: Optional[str], + vad_cmvn_file: Optional[str] = None, + # raw_inputs: Union[np.ndarray, torch.Tensor] = None, + key_file: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + num_workers: int = 1, + **kwargs, +): + assert check_argument_types() + + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: 
%(message)s", + ) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + batch_size = 1 + + # 1. Set random-seed + set_all_random_seed(seed) + + # 2. Build speech2vadsegment + speech2vadsegment_kwargs = dict( + vad_infer_config=vad_infer_config, + vad_model_file=vad_model_file, + vad_cmvn_file=vad_cmvn_file, + device=device, + dtype=dtype, + ) + logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) + speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs) + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + output_dir_v2: Optional[str] = None, + fs: dict = None, + param_dict: dict = None, + ): + # 3. Build data-iterator + if data_path_and_name_and_type is None and raw_inputs is not None: + if isinstance(raw_inputs, torch.Tensor): + raw_inputs = raw_inputs.numpy() + data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] + loader = VADTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), + collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + finish_count = 0 + file_count = 1 + # 7 .Start for-loop + # FIXME(kamo): The output format should be discussed about + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path is not None: + writer = DatadirWriter(output_path) + ibest_writer = writer[f"1best_recog"] + else: + writer = None + ibest_writer = None + + vad_results = [] + batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict() + is_final = param_dict.get('is_final', False) if param_dict is not None else False + max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800 + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + batch['in_cache'] = batch_in_cache + batch['is_final'] = is_final + batch['max_end_sil'] = max_end_sil + + # do vad segment + _, results, param_dict['in_cache'] = speech2vadsegment(**batch) + # param_dict['in_cache'] = batch['in_cache'] + if results: + for i, _ in enumerate(keys): + if results[i]: + if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": + results[i] = json.dumps(results[i]) + item = {'key': keys[i], 'value': results[i]} + vad_results.append(item) + if writer is not None: + ibest_writer["text"][keys[i]] = "{}".format(results[i]) + + return vad_results + + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( @@ -111,11 +360,9 @@ def get_parser(): def inference_launch(mode, **kwargs): if mode == "offline": - from funasr.bin.vad_inference import inference_modelscope - return inference_modelscope(**kwargs) + return inference_vad(**kwargs) elif mode == "online": - from funasr.bin.vad_inference import inference_modelscope_online - return inference_modelscope_online(**kwargs) + return inference_vad_online(**kwargs) else: logging.info("Unknown decoding mode: {}".format(mode)) return None @@ -147,8 +394,8 @@ def main(cmd=None): os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID" os.environ["CUDA_VISIBLE_DEVICES"] = gpuid - inference_launch(**kwargs) - 
+ inference_pipeline = inference_launch(**kwargs) + return inference_pipeline(kwargs["data_path_and_name_and_type"]) if __name__ == "__main__": main() diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py deleted file mode 100644 index a3633093e..000000000 --- a/funasr/bin/vad_inference_online.py +++ /dev/null @@ -1,344 +0,0 @@ -import argparse -import logging -import os -import sys -import json -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.tasks.vad import VADTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.models.frontend.wav_frontend import WavFrontendOnline -from funasr.models.frontend.wav_frontend import WavFrontend -from funasr.bin.vad_inference import Speech2VadSegment - -header_colors = '\033[95m' -end_colors = '\033[0m' - - -class Speech2VadSegmentOnline(Speech2VadSegment): - """Speech2VadSegmentOnline class - - Examples: - >>> import soundfile - >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2segment(audio) - [[10, 230], [245, 450], ...] - - """ - def __init__(self, **kwargs): - super(Speech2VadSegmentOnline, self).__init__(**kwargs) - vad_cmvn_file = kwargs.get('vad_cmvn_file', None) - self.frontend = None - if self.vad_infer_args.frontend is not None: - self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf) - - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, - in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800 - ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - batch_size = speech.shape[0] - segments = [[]] * batch_size - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final) - fbanks, _ = self.frontend.get_fbank() - else: - raise Exception("Need to extract feats first, please configure frontend configuration") - if feats.shape[0]: - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - waveforms = self.frontend.get_waveforms() - - batch = { - "feats": feats, - "waveform": waveforms, - "in_cache": in_cache, - "is_final": is_final, - "max_end_sil": max_end_sil - } - # a. 
To device - batch = to_device(batch, device=self.device) - segments, in_cache = self.vad_model.forward_online(**batch) - # in_cache.update(batch['in_cache']) - # in_cache = {key: value for key, value in batch['in_cache'].items()} - return fbanks, segments, in_cache - - -def inference( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - inference_pipeline = inference_modelscope( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - assert check_argument_types() - ncpu = kwargs.get("ncpu", 1) - torch.set_num_threads(ncpu) - - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - batch_size = 1 - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - ): - # 3. 
Build data-iterator - if data_path_and_name_and_type is None and raw_inputs is not None: - if isinstance(raw_inputs, torch.Tensor): - raw_inputs = raw_inputs.numpy() - data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] - loader = VADTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - else: - writer = None - ibest_writer = None - - vad_results = [] - batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict() - is_final = param_dict.get('is_final', False) if param_dict is not None else False - max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800 - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - batch['in_cache'] = batch_in_cache - batch['is_final'] = is_final - batch['max_end_sil'] = max_end_sil - - # do vad segment - _, results, param_dict['in_cache'] = speech2vadsegment(**batch) - # param_dict['in_cache'] = batch['in_cache'] - if results: - for i, _ in enumerate(keys): - if results[i]: - if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas": - results[i] = json.dumps(results[i]) - item = {'key': keys[i], 'value': results[i]} - vad_results.append(item) - if writer is not None: - ibest_writer["text"][keys[i]] = "{}".format(results[i]) - - return vad_results - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="VAD Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--vad_infer_config", - type=str, - help="VAD infer configuration", - ) - group.add_argument( - "--vad_model_file", - type=str, - help="VAD model parameter file", - ) - group.add_argument( - "--vad_cmvn_file", - type=str, - help="Global cmvn file", - ) - - group = parser.add_argument_group("infer related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main() diff --git a/funasr/version.txt b/funasr/version.txt index 4b9fcbec1..cb0c939a9 100644 --- a/funasr/version.txt +++ b/funasr/version.txt @@ -1 +1 @@ -0.5.1 +0.5.2
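
For readers wiring the refactored launcher into their own scripts, here is a minimal, hedged sketch of how the new entry points in `funasr/bin/vad_inference_launch.py` might be driven after this patch. It follows the signatures visible in the diff (`inference_launch(mode, **kwargs)` and the `_forward(data_path_and_name_and_type, raw_inputs=..., param_dict=...)` closure returned by `inference_vad` / `inference_vad_online`); the config, model, and CMVN paths, the dummy waveform, and the chunking scheme are placeholders, and the exact shape/dtype expected for `raw_inputs` is decided by `VADTask.build_streaming_iterator` and `WavFrontendOnline`, which this patch does not show.

```python
# Hedged usage sketch, not part of the patch: file paths and the dummy audio
# below are placeholders; argument names follow the signatures in the diff.
import numpy as np

from funasr.bin.vad_inference_launch import inference_launch

common = dict(
    batch_size=1,
    ngpu=0,                       # 0 -> CPU decoding
    log_level="INFO",
    vad_infer_config="vad.yaml",  # placeholder config path
    vad_model_file="vad.pb",      # placeholder model path
    vad_cmvn_file="vad.mvn",      # placeholder CMVN path
)

# Offline mode: inference_launch("offline", ...) builds Speech2VadSegment and
# returns the _forward closure from inference_vad().
offline_pipeline = inference_launch("offline", **common)
audio = np.zeros(16000, dtype=np.float32)        # 1 s of silence as a stand-in
print(offline_pipeline(None, raw_inputs=audio))  # [{'key': ..., 'value': [[beg, end], ...]}]

# Online mode: the caller owns the streaming state through param_dict; the
# closure reads 'in_cache' / 'is_final' / 'max_end_sil' and writes 'in_cache' back.
online_pipeline = inference_launch("online", **common)
param_dict = {"in_cache": {}, "is_final": False, "max_end_sil": 800}
chunks = np.split(audio, 10)                     # chunk size is an assumption
for i, chunk in enumerate(chunks):
    param_dict["is_final"] = i == len(chunks) - 1
    segments = online_pipeline(None, raw_inputs=chunk, param_dict=param_dict)
    if segments:                                 # only non-empty results are returned
        print(segments)
```

The design change this illustrates: `inference_launch()` now returns the `_forward` closure instead of executing it, so `main()` calls the returned pipeline with `data_path_and_name_and_type`, and ModelScope-style callers can keep the pipeline object around and feed it `raw_inputs` plus a persistent `param_dict` chunk by chunk.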