diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py
deleted file mode 100644
index 3fa3f9d26..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
deleted file mode 100644
index 862f88198..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cantonese-CHS.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md
deleted file mode 100644
index c68a8cd4f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- - batch_bins: # batch size
- - max_epoch: # number of training epoch
- - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
- python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
- - audio_in: # support wav, url, bytes, and parsed audio format.
- - output_dir: # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
- python infer.py
-```
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py
deleted file mode 100644
index f15e3b968..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params.output_dir):
- os.makedirs(params.output_dir, exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params.data_path)
- kwargs = dict(
- model=params.model,
- model_revision=params.model_revision,
- data_dir=ds_dict,
- dataset_type=params.dataset_type,
- work_dir=params.output_dir,
- batch_bins=params.batch_bins,
- max_epoch=params.max_epoch,
- lr=params.lr)
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- from funasr.utils.modelscope_param import modelscope_args
- params = modelscope_args(model="speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline", data_path="./data")
- params.output_dir = "./checkpoint" # m模型保存路径
- params.data_path = "./example_data/" # 数据路径
- params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
- params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
- params.max_epoch = 20 # 最大训练轮数
- params.lr = 0.00005 # 设置学习率
-
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py
deleted file mode 100644
index 347d31694..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
- audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
- output_dir = None
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in)
- print(rec_result)
-
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py
deleted file mode 100644
index 68d7ba81e..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
deleted file mode 100644
index f82c1f4c4..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_de.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py
deleted file mode 100644
index 397b7ffcf..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
deleted file mode 100644
index 98f31b602..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py
deleted file mode 100644
index 3846ff620..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
deleted file mode 100644
index 75e22a0e9..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_es.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
deleted file mode 100644
index b68f1e921..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
- - dataset_type: # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
- - batch_bins: # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
- - max_epoch: # number of training epoch
- - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
- python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
- - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - output_dir: # result dir
- - ngpu: # the number of GPUs for decoding
- - njob: # the number of jobs for each GPU
-
-- Then you can run the pipeline to infer with:
-```python
- python infer.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-
-- Then you can run the pipeline to finetune with:
-```python
- python infer_after_finetune.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py
deleted file mode 100644
index 2ecc22917..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/finetune.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-
-from funasr.datasets.ms_dataset import MsDataset
-from funasr.utils.modelscope_param import modelscope_args
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params.output_dir):
- os.makedirs(params.output_dir, exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params.data_path)
- kwargs = dict(
- model=params.model,
- data_dir=ds_dict,
- dataset_type=params.dataset_type,
- work_dir=params.output_dir,
- batch_bins=params.batch_bins,
- max_epoch=params.max_epoch,
- lr=params.lr)
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = modelscope_args(model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline", data_path="./data")
- params.output_dir = "./checkpoint" # m模型保存路径
- params.data_path = "./example_data/" # 数据路径
- params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
- params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
- params.max_epoch = 20 # 最大训练轮数
- params.lr = 0.00005 # 设置学习率
-
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
deleted file mode 100644
index e6c39c2b8..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
+++ /dev/null
@@ -1,89 +0,0 @@
-import os
-import shutil
-from multiprocessing import Pool
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
- output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
- if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
- gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
- else:
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline",
- output_dir=output_dir_job,
- batch_size=1
- )
- audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
- inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
-
-
-def modelscope_infer(params):
- # prepare for multi-GPU decoding
- ngpu = params["ngpu"]
- njob = params["njob"]
- output_dir = params["output_dir"]
- if os.path.exists(output_dir):
- shutil.rmtree(output_dir)
- os.mkdir(output_dir)
- split_dir = os.path.join(output_dir, "split")
- os.mkdir(split_dir)
- nj = ngpu * njob
- wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
- with open(wav_scp_file) as f:
- lines = f.readlines()
- num_lines = len(lines)
- num_job_lines = num_lines // nj
- start = 0
- for i in range(nj):
- end = start + num_job_lines
- file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
- with open(file, "w") as f:
- if i == nj - 1:
- f.writelines(lines[start:])
- else:
- f.writelines(lines[start:end])
- start = end
-
- p = Pool(nj)
- for i in range(nj):
- p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
- p.close()
- p.join()
-
- # combine decoding results
- best_recog_path = os.path.join(output_dir, "1best_recog")
- os.mkdir(best_recog_path)
- files = ["text", "token", "score"]
- for file in files:
- with open(os.path.join(best_recog_path, file), "w") as f:
- for i in range(nj):
- job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
- with open(job_file) as f_job:
- lines = f_job.readlines()
- f.writelines(lines)
-
- # If text exists, compute CER
- text_in = os.path.join(params["data_dir"], "text")
- if os.path.exists(text_in):
- text_proc_file = os.path.join(best_recog_path, "token")
- compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
- os.system("tail -n 3 {}".format(os.path.join(best_recog_path, "text.cer")))
-
-
-if __name__ == "__main__":
- params = {}
- params["data_dir"] = "./data/test"
- params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 8
- modelscope_infer(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
deleted file mode 100644
index 6593f4e3f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ /dev/null
@@ -1,54 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_after_finetune(params):
- # prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
- decoding_path = os.path.join(params["output_dir"], "decode_results")
- if os.path.exists(decoding_path):
- shutil.rmtree(decoding_path)
- os.mkdir(decoding_path)
-
- # decoding
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
- output_dir=decoding_path,
- batch_size=1
- )
- audio_in = os.path.join(params["data_dir"], "wav.scp")
- inference_pipeline(audio_in=audio_in)
-
- # computer CER if GT text is set
- text_in = os.path.join(params["data_dir"], "text")
- if os.path.exists(text_in):
- text_proc_file = os.path.join(decoding_path, "1best_recog/token")
- compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
- os.system("tail -n 3 {}".format(os.path.join(decoding_path, "text.cer")))
-
-
-if __name__ == '__main__':
- params = {}
- params["modelscope_model_name"] = "damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pb"
- modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py
deleted file mode 100644
index 4746cc2da..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
deleted file mode 100644
index 627d132fc..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_fr.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py
deleted file mode 100644
index 985b838ab..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
deleted file mode 100644
index e53c37e60..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_id.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py
deleted file mode 100644
index 5485ff56e..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
deleted file mode 100644
index 68cc41d54..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ja.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py
deleted file mode 100644
index fd9c44294..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
deleted file mode 100644
index b87bcbb84..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ko.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py
deleted file mode 100644
index 512b844c6..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
deleted file mode 100644
index 4a43e7ce5..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_pt.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py
deleted file mode 100644
index 432266dc8..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
deleted file mode 100644
index 3c9d364e9..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ru.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py
deleted file mode 100644
index 3a90ed21f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params["output_dir"]):
- os.makedirs(params["output_dir"], exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params["data_dir"])
- kwargs = dict(
- model=params["model"],
- model_revision=params["model_revision"],
- data_dir=ds_dict,
- dataset_type=params["dataset_type"],
- work_dir=params["output_dir"],
- batch_bins=params["batch_bins"],
- max_epoch=params["max_epoch"],
- lr=params["lr"])
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = {}
- params["output_dir"] = "./checkpoint"
- params["data_dir"] = "./data"
- params["batch_bins"] = 2000
- params["dataset_type"] = "small"
- params["max_epoch"] = 50
- params["lr"] = 0.00005
- params["model"] = "damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline"
- params["model_revision"] = None
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
deleted file mode 100644
index 4218f3d7a..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
+++ /dev/null
@@ -1,13 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == "__main__":
- audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_vi.wav"
- output_dir = "./results"
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
- print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md
deleted file mode 100644
index c68a8cd4f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- - batch_bins: # batch size
- - max_epoch: # number of training epoch
- - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
- python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
- - audio_in: # support wav, url, bytes, and parsed audio format.
- - output_dir: # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
- python infer.py
-```
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py
deleted file mode 100644
index 73aae7dd9..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params.output_dir):
- os.makedirs(params.output_dir, exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params.data_path)
- kwargs = dict(
- model=params.model,
- model_revision=params.model_revision,
- data_dir=ds_dict,
- dataset_type=params.dataset_type,
- work_dir=params.output_dir,
- batch_bins=params.batch_bins,
- max_epoch=params.max_epoch,
- lr=params.lr)
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- from funasr.utils.modelscope_param import modelscope_args
- params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline", data_path="./data")
- params.output_dir = "./checkpoint" # m模型保存路径
- params.data_path = "./example_data/" # 数据路径
- params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
- params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
- params.max_epoch = 20 # 最大训练轮数
- params.lr = 0.00005 # 设置学习率
-
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py
deleted file mode 100644
index 35209896c..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
- audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
- output_dir = None
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in)
- print(rec_result)
-
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
deleted file mode 100644
index 9a84f9b57..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained UniASR Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
- - dataset_type: # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
- - batch_bins: # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
- - max_epoch: # number of training epoch
- - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
- python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
- - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - output_dir: # result dir
- - ngpu: # the number of GPUs for decoding
- - njob: # the number of jobs for each GPU
-
-- Then you can run the pipeline to infer with:
-```python
- python infer.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - decoding_model_name: # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-
-- Then you can run the pipeline to finetune with:
-```python
- python infer_after_finetune.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py
deleted file mode 100644
index b2325b2bb..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/finetune.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-
-from funasr.datasets.ms_dataset import MsDataset
-from funasr.utils.modelscope_param import modelscope_args
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params.output_dir):
- os.makedirs(params.output_dir, exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params.data_path)
- kwargs = dict(
- model=params.model,
- data_dir=ds_dict,
- dataset_type=params.dataset_type,
- work_dir=params.output_dir,
- batch_bins=params.batch_bins,
- max_epoch=params.max_epoch,
- lr=params.lr)
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline", data_path="./data")
- params.output_dir = "./checkpoint" # m模型保存路径
- params.data_path = "./example_data/" # 数据路径
- params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
- params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
- params.max_epoch = 20 # 最大训练轮数
- params.lr = 0.00005 # 设置学习率
-
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
deleted file mode 100644
index 13d2a2e37..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
+++ /dev/null
@@ -1,87 +0,0 @@
-import os
-import shutil
-from multiprocessing import Pool
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
- output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
- if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
- gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
- else:
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline",
- output_dir=output_dir_job,
- batch_size=1
- )
- audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
- inference_pipeline(audio_in=audio_in)
-
-def modelscope_infer(params):
- # prepare for multi-GPU decoding
- ngpu = params["ngpu"]
- njob = params["njob"]
- output_dir = params["output_dir"]
- if os.path.exists(output_dir):
- shutil.rmtree(output_dir)
- os.mkdir(output_dir)
- split_dir = os.path.join(output_dir, "split")
- os.mkdir(split_dir)
- nj = ngpu * njob
- wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
- with open(wav_scp_file) as f:
- lines = f.readlines()
- num_lines = len(lines)
- num_job_lines = num_lines // nj
- start = 0
- for i in range(nj):
- end = start + num_job_lines
- file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
- with open(file, "w") as f:
- if i == nj - 1:
- f.writelines(lines[start:])
- else:
- f.writelines(lines[start:end])
- start = end
-
- p = Pool(nj)
- for i in range(nj):
- p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
- p.close()
- p.join()
-
- # combine decoding results
- best_recog_path = os.path.join(output_dir, "1best_recog")
- os.mkdir(best_recog_path)
- files = ["text", "token", "score"]
- for file in files:
- with open(os.path.join(best_recog_path, file), "w") as f:
- for i in range(nj):
- job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
- with open(job_file) as f_job:
- lines = f_job.readlines()
- f.writelines(lines)
-
- # If text exists, compute CER
- text_in = os.path.join(params["data_dir"], "text")
- if os.path.exists(text_in):
- text_proc_file = os.path.join(best_recog_path, "text")
- compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
-
-
-if __name__ == "__main__":
- params = {}
- params["data_dir"] = "./data/test"
- params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
deleted file mode 100644
index 1e9c4d1f0..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import json
-import os
-import shutil
-
-from multiprocessing import Pool
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_after_finetune_core(model_dir, output_dir, split_dir, njob, idx):
- output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
- if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
- gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
- else:
- os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model=model_dir,
- output_dir=output_dir_job,
- batch_size=1
- )
- audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
- inference_pipeline(audio_in=audio_in)
-
-def modelscope_infer_after_finetune(params):
- # prepare for multi-GPU decoding
- model_dir = params["model_dir"]
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(model_dir, "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(model_dir, file_name))
- ngpu = params["ngpu"]
- njob = params["njob"]
- output_dir = params["output_dir"]
- if os.path.exists(output_dir):
- shutil.rmtree(output_dir)
- os.mkdir(output_dir)
- split_dir = os.path.join(output_dir, "split")
- os.mkdir(split_dir)
- nj = ngpu * njob
- wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
- with open(wav_scp_file) as f:
- lines = f.readlines()
- num_lines = len(lines)
- num_job_lines = num_lines // nj
- start = 0
- for i in range(nj):
- end = start + num_job_lines
- file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
- with open(file, "w") as f:
- if i == nj - 1:
- f.writelines(lines[start:])
- else:
- f.writelines(lines[start:end])
- start = end
-
- p = Pool(nj)
- for i in range(nj):
- p.apply_async(modelscope_infer_after_finetune_core,
- args=(model_dir, output_dir, split_dir, njob, str(i + 1)))
- p.close()
- p.join()
-
- # combine decoding results
- best_recog_path = os.path.join(output_dir, "1best_recog")
- os.mkdir(best_recog_path)
- files = ["text", "token", "score"]
- for file in files:
- with open(os.path.join(best_recog_path, file), "w") as f:
- for i in range(nj):
- job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
- with open(job_file) as f_job:
- lines = f_job.readlines()
- f.writelines(lines)
-
- # If text exists, compute CER
- text_in = os.path.join(params["data_dir"], "text")
- if os.path.exists(text_in):
- text_proc_file = os.path.join(best_recog_path, "token")
- compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
-
-if __name__ == '__main__':
- params = {}
- params["modelscope_model_name"] = "damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
- params["model_dir"] = "./checkpoint"
- params["output_dir"] = "./results"
- params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pb"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer_after_finetune(params)
-
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md
deleted file mode 100644
index c68a8cd4f..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
- - output_dir: # result dir
- - data_dir: # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
- - batch_bins: # batch size
- - max_epoch: # number of training epoch
- - lr: # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
- python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
- - audio_in: # support wav, url, bytes, and parsed audio format.
- - output_dir: # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
- python infer.py
-```
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py
deleted file mode 100644
index b18296ecc..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/finetune.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import os
-from modelscope.metainfo import Trainers
-from modelscope.trainers import build_trainer
-from funasr.datasets.ms_dataset import MsDataset
-
-
-def modelscope_finetune(params):
- if not os.path.exists(params.output_dir):
- os.makedirs(params.output_dir, exist_ok=True)
- # dataset split ["train", "validation"]
- ds_dict = MsDataset.load(params.data_path)
- kwargs = dict(
- model=params.model,
- model_revision=params.model_revision,
- data_dir=ds_dict,
- dataset_type=params.dataset_type,
- work_dir=params.output_dir,
- batch_bins=params.batch_bins,
- max_epoch=params.max_epoch,
- lr=params.lr)
- trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
- trainer.train()
-
-
-if __name__ == '__main__':
- from funasr.utils.modelscope_param import modelscope_args
- params = modelscope_args(model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline", data_path="./data")
- params.output_dir = "./checkpoint" # m模型保存路径
- params.data_path = "./example_data/" # 数据路径
- params.dataset_type = "small" # 小数据量设置small,若数据量大于1000小时,请使用large
- params.batch_bins = 2000 # batch size,如果dataset_type="small",batch_bins单位为fbank特征帧数,如果dataset_type="large",batch_bins单位为毫秒,
- params.max_epoch = 20 # 最大训练轮数
- params.lr = 0.00005 # 设置学习率
-
- modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
deleted file mode 100644
index 8ec42885d..000000000
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
+++ /dev/null
@@ -1,14 +0,0 @@
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
- audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
- output_dir = None
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline",
- output_dir=output_dir,
- )
- rec_result = inference_pipeline(audio_in=audio_in)
- print(rec_result)
-
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
index 20994d39c..45b5e331e 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
@@ -1,12 +1,12 @@
-##################text.scp文件路径###################
-inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"
+##################text.scp###################
+# inputs = "./egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt"
-##################text二进制数据#####################
+##################text#####################
#inputs = "我们都是木头人不会讲话不会动"
-##################text文件url#######################
-#inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
+##################text file url#######################
+inputs = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt"
from modelscope.pipelines import pipeline
diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/infer.py
rename to egs_modelscope/speaker_verification/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/demo.py
diff --git a/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py b/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py
rename to egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/demo.py
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index dce9ee009..488be16ee 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -46,11 +46,12 @@ from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaform
from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
-from funasr.bin.tp_inference import SpeechText2Timestamp
+from funasr.bin.tp_infer import Speech2Timestamp
from funasr.bin.vad_inference import Speech2VadSegment
-from funasr.bin.punctuation_infer import Text2Punc
+from funasr.bin.punc_infer import Text2Punc
from funasr.utils.vad_utils import slice_padding_fbank
from funasr.tasks.vad import VADTask
+
from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
@@ -616,6 +617,7 @@ class Speech2TextParaformerOnline:
# 1. Build ASR model
scorers = {}
+ from funasr.tasks.asr import ASRTaskParaformer as ASRTask
asr_model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, device
)
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
deleted file mode 100644
index f70382bf1..000000000
--- a/funasr/bin/asr_inference.py
+++ /dev/null
@@ -1,592 +0,0 @@
-#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import argparse
-import logging
-import sys
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.beam_search.batch_beam_search import BatchBeamSearch
-from funasr.modules.beam_search.batch_beam_search_online_sim import BatchBeamSearchOnlineSim
-from funasr.modules.beam_search.beam_search import BeamSearch
-from funasr.modules.beam_search.beam_search import Hypothesis
-from funasr.modules.scorers.ctc import CTCPrefixScorer
-from funasr.modules.scorers.length_bonus import LengthBonus
-from funasr.modules.scorers.scorer_interface import BatchScorerInterface
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.asr import ASRTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend
-from funasr.tasks.asr import frontend_choices
-
-
-header_colors = '\033[95m'
-end_colors = '\033[0m'
-
-
-class Speech2Text:
- """Speech2Text class
-
- Examples:
- >>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2text(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- maxlenratio: float = 0.0,
- minlenratio: float = 0.0,
- batch_size: int = 1,
- dtype: str = "float32",
- beam_size: int = 20,
- ctc_weight: float = 0.5,
- lm_weight: float = 1.0,
- ngram_weight: float = 0.9,
- penalty: float = 0.0,
- nbest: int = 1,
- streaming: bool = False,
- frontend_conf: dict = None,
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build ASR model
- scorers = {}
- asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
- frontend = None
- if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
- if asr_train_args.frontend=='wav_frontend':
- frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
- else:
- frontend_class=frontend_choices.get_class(asr_train_args.frontend)
- frontend = frontend_class(**asr_train_args.frontend_conf).eval()
-
- logging.info("asr_model: {}".format(asr_model))
- logging.info("asr_train_args: {}".format(asr_train_args))
- asr_model.to(dtype=getattr(torch, dtype)).eval()
-
- decoder = asr_model.decoder
-
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
- token_list = asr_model.token_list
- scorers.update(
- decoder=decoder,
- ctc=ctc,
- length_bonus=LengthBonus(len(token_list)),
- )
-
- # 2. Build Language model
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, None, device
- )
- scorers["lm"] = lm.lm
-
- # 3. Build ngram model
- # ngram is not supported now
- ngram = None
- scorers["ngram"] = ngram
-
- # 4. Build BeamSearch object
- # transducer is not supported now
- beam_search_transducer = None
-
- weights = dict(
- decoder=1.0 - ctc_weight,
- ctc=ctc_weight,
- lm=lm_weight,
- ngram=ngram_weight,
- length_bonus=penalty,
- )
- beam_search = BeamSearch(
- beam_size=beam_size,
- weights=weights,
- scorers=scorers,
- sos=asr_model.sos,
- eos=asr_model.eos,
- vocab_size=len(token_list),
- token_list=token_list,
- pre_beam_score_key=None if ctc_weight == 1.0 else "full",
- )
-
- # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.converter = converter
- self.tokenizer = tokenizer
- self.beam_search = beam_search
- self.beam_search_transducer = beam_search_transducer
- self.maxlenratio = maxlenratio
- self.minlenratio = minlenratio
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
- self.frontend = frontend
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
- ) -> List[
- Tuple[
- Optional[str],
- List[str],
- List[int],
- Union[Hypothesis],
- ]
- ]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.asr_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
- lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
- batch = {"speech": feats, "speech_lengths": feats_len}
-
- # a. To device
- batch = to_device(batch, device=self.device)
-
- # b. Forward Encoder
- enc, _ = self.asr_model.encode(**batch)
- if isinstance(enc, tuple):
- enc = enc[0]
- assert len(enc) == 1, len(enc)
-
- # c. Passed the encoder result and the beam search
- nbest_hyps = self.beam_search(
- x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
- )
-
- nbest_hyps = nbest_hyps[: self.nbest]
-
- results = []
- for hyp in nbest_hyps:
- assert isinstance(hyp, (Hypothesis)), type(hyp)
-
- # remove sos/eos and get results
- last_pos = -1
- if isinstance(hyp.yseq, list):
- token_int = hyp.yseq[1:last_pos]
- else:
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0, token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
-
- if self.tokenizer is not None:
- text = self.tokenizer.tokens2text(token)
- else:
- text = None
- results.append((text, token, token_int, hyp))
-
- assert check_return_type(results)
- return results
-
-
-
-def inference_modelscope(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- mc: bool = False,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- for handler in logging.root.handlers[:]:
- logging.root.removeHandler(handler)
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- streaming=streaming,
- )
- logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
- speech2text = Speech2Text(**speech2text_kwargs)
-
- def _forward(data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- fs=fs,
- mc=mc,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
- collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- asr_result_list = []
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- else:
- writer = None
-
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- # N-best list of (text, token, token_int, hyp_object)
- try:
- results = speech2text(**batch)
- except TooShortUttError as e:
- logging.warning(f"Utterance {keys} {e}")
- hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["sil"], [2], hyp]] * nbest
-
- # Only supporting batch_size==1
- key = keys[0]
- for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
- # Create a directory: outdir/{n}best_recog
- if writer is not None:
- ibest_writer = writer[f"{n}best_recog"]
-
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
-
- if text is not None:
- text_postprocessed, _ = postprocess_utils.sentence_postprocess(token)
- item = {'key': key, 'value': text_postprocessed}
- asr_result_list.append(item)
- finish_count += 1
- asr_utils.print_progress(finish_count / file_count)
- if writer is not None:
- ibest_writer["text"][key] = text
-
- logging.info("uttid: {}".format(key))
- logging.info("text predictions: {}\n".format(text))
- return asr_result_list
-
- return _forward
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="ASR Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--word_lm_train_config",
- type=str,
- help="Word LM training configuration",
- )
- group.add_argument(
- "--word_lm_file",
- type=str,
- help="Word LM parameter file",
- )
- group.add_argument(
- "--ngram_file",
- type=str,
- help="N-gram parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=20, help="Beam size")
- group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
- group.add_argument(
- "--maxlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain max output length. "
- "If maxlenratio=0.0 (default), it uses a end-detect "
- "function "
- "to automatically find maximum hypothesis lengths."
- "If maxlenratio<0.0, its absolute value is interpreted"
- "as a constant max output length",
- )
- group.add_argument(
- "--minlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain min output length",
- )
- group.add_argument(
- "--ctc_weight",
- type=float,
- default=0.5,
- help="CTC weight in joint decoding",
- )
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
- group.add_argument("--streaming", type=str2bool, default=False)
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
\ No newline at end of file
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 6ad17f0c6..18700321c 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -852,7 +852,7 @@ def inference_uniasr(
decoding_ind=decoding_ind,
decoding_mode=decoding_mode,
)
- speech2text = Speech2Text(**speech2text_kwargs)
+ speech2text = Speech2TextUniASR(**speech2text_kwargs)
def _forward(data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
deleted file mode 100644
index e83286958..000000000
--- a/funasr/bin/asr_inference_mfcca.py
+++ /dev/null
@@ -1,767 +0,0 @@
-#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import argparse
-import logging
-import sys
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.beam_search.batch_beam_search import BatchBeamSearch
-from funasr.modules.beam_search.beam_search import BeamSearch
-from funasr.modules.beam_search.beam_search import Hypothesis
-from funasr.modules.scorers.ctc import CTCPrefixScorer
-from funasr.modules.scorers.length_bonus import LengthBonus
-from funasr.modules.scorers.scorer_interface import BatchScorerInterface
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.asr import ASRTaskMFCCA as ASRTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-import pdb
-
-
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
- 'audio_fs': 16000,
- 'model_fs': 16000
-}
-
-class Speech2Text:
- """Speech2Text class
-
- Examples:
- >>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2text(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- maxlenratio: float = 0.0,
- minlenratio: float = 0.0,
- batch_size: int = 1,
- dtype: str = "float32",
- beam_size: int = 20,
- ctc_weight: float = 0.5,
- lm_weight: float = 1.0,
- ngram_weight: float = 0.9,
- penalty: float = 0.0,
- nbest: int = 1,
- streaming: bool = False,
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build ASR model
- scorers = {}
- asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
-
- logging.info("asr_model: {}".format(asr_model))
- logging.info("asr_train_args: {}".format(asr_train_args))
- asr_model.to(dtype=getattr(torch, dtype)).eval()
-
- decoder = asr_model.decoder
-
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
- token_list = asr_model.token_list
- scorers.update(
- decoder=decoder,
- ctc=ctc,
- length_bonus=LengthBonus(len(token_list)),
- )
-
- # 2. Build Language model
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, device
- )
- lm.to(device)
- scorers["lm"] = lm.lm
- # 3. Build ngram model
- # ngram is not supported now
- ngram = None
- scorers["ngram"] = ngram
-
- # 4. Build BeamSearch object
- # transducer is not supported now
- beam_search_transducer = None
-
- weights = dict(
- decoder=1.0 - ctc_weight,
- ctc=ctc_weight,
- lm=lm_weight,
- ngram=ngram_weight,
- length_bonus=penalty,
- )
- beam_search = BeamSearch(
- beam_size=beam_size,
- weights=weights,
- scorers=scorers,
- sos=asr_model.sos,
- eos=asr_model.eos,
- vocab_size=len(token_list),
- token_list=token_list,
- pre_beam_score_key=None if ctc_weight == 1.0 else "full",
- )
- #beam_search.__class__ = BatchBeamSearch
- # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.converter = converter
- self.tokenizer = tokenizer
- self.beam_search = beam_search
- self.beam_search_transducer = beam_search_transducer
- self.maxlenratio = maxlenratio
- self.minlenratio = minlenratio
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
- ) -> List[
- Tuple[
- Optional[str],
- List[str],
- List[int],
- Union[Hypothesis],
- ]
- ]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- if(speech.dim()==3):
- speech = torch.squeeze(speech, 2)
- #speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- speech = speech.to(getattr(torch, self.dtype))
- # lenghts: (1,)
- lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
- batch = {"speech": speech, "speech_lengths": lengths}
-
- # a. To device
- batch = to_device(batch, device=self.device)
-
- # b. Forward Encoder
- enc, _ = self.asr_model.encode(**batch)
-
- assert len(enc) == 1, len(enc)
-
- # c. Passed the encoder result and the beam search
- nbest_hyps = self.beam_search(
- x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
- )
-
- nbest_hyps = nbest_hyps[: self.nbest]
-
- results = []
- for hyp in nbest_hyps:
- assert isinstance(hyp, (Hypothesis)), type(hyp)
-
- # remove sos/eos and get results
- last_pos = -1
- if isinstance(hyp.yseq, list):
- token_int = hyp.yseq[1:last_pos]
- else:
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0, token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
-
- if self.tokenizer is not None:
- text = self.tokenizer.tokens2text(token)
- else:
- text = None
- results.append((text, token, token_int, hyp))
-
- assert check_return_type(results)
- return results
-
-
-# def inference(
-# maxlenratio: float,
-# minlenratio: float,
-# batch_size: int,
-# beam_size: int,
-# ngpu: int,
-# ctc_weight: float,
-# lm_weight: float,
-# penalty: float,
-# log_level: Union[int, str],
-# data_path_and_name_and_type,
-# asr_train_config: Optional[str],
-# asr_model_file: Optional[str],
-# cmvn_file: Optional[str] = None,
-# lm_train_config: Optional[str] = None,
-# lm_file: Optional[str] = None,
-# token_type: Optional[str] = None,
-# key_file: Optional[str] = None,
-# word_lm_train_config: Optional[str] = None,
-# bpemodel: Optional[str] = None,
-# allow_variable_data_keys: bool = False,
-# streaming: bool = False,
-# output_dir: Optional[str] = None,
-# dtype: str = "float32",
-# seed: int = 0,
-# ngram_weight: float = 0.9,
-# nbest: int = 1,
-# num_workers: int = 1,
-# **kwargs,
-# ):
-# assert check_argument_types()
-# if batch_size > 1:
-# raise NotImplementedError("batch decoding is not implemented")
-# if word_lm_train_config is not None:
-# raise NotImplementedError("Word LM is not implemented")
-# if ngpu > 1:
-# raise NotImplementedError("only single GPU decoding is supported")
-#
-# logging.basicConfig(
-# level=log_level,
-# format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
-# )
-#
-# if ngpu >= 1 and torch.cuda.is_available():
-# device = "cuda"
-# else:
-# device = "cpu"
-#
-# # 1. Set random-seed
-# set_all_random_seed(seed)
-#
-# # 2. Build speech2text
-# speech2text_kwargs = dict(
-# asr_train_config=asr_train_config,
-# asr_model_file=asr_model_file,
-# cmvn_file=cmvn_file,
-# lm_train_config=lm_train_config,
-# lm_file=lm_file,
-# token_type=token_type,
-# bpemodel=bpemodel,
-# device=device,
-# maxlenratio=maxlenratio,
-# minlenratio=minlenratio,
-# dtype=dtype,
-# beam_size=beam_size,
-# ctc_weight=ctc_weight,
-# lm_weight=lm_weight,
-# ngram_weight=ngram_weight,
-# penalty=penalty,
-# nbest=nbest,
-# streaming=streaming,
-# )
-# logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
-# speech2text = Speech2Text(**speech2text_kwargs)
-#
-# # 3. Build data-iterator
-# loader = ASRTask.build_streaming_iterator(
-# data_path_and_name_and_type,
-# dtype=dtype,
-# batch_size=batch_size,
-# key_file=key_file,
-# num_workers=num_workers,
-# preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
-# collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
-# allow_variable_data_keys=allow_variable_data_keys,
-# inference=True,
-# )
-#
-# finish_count = 0
-# file_count = 1
-# # 7 .Start for-loop
-# # FIXME(kamo): The output format should be discussed about
-# asr_result_list = []
-# if output_dir is not None:
-# writer = DatadirWriter(output_dir)
-# else:
-# writer = None
-#
-# for keys, batch in loader:
-# assert isinstance(batch, dict), type(batch)
-# assert all(isinstance(s, str) for s in keys), keys
-# _bs = len(next(iter(batch.values())))
-# assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-# #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-#
-# # N-best list of (text, token, token_int, hyp_object)
-# try:
-# results = speech2text(**batch)
-# except TooShortUttError as e:
-# logging.warning(f"Utterance {keys} {e}")
-# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-# results = [[" ", [""], [2], hyp]] * nbest
-#
-# # Only supporting batch_size==1
-# key = keys[0]
-# for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
-# # Create a directory: outdir/{n}best_recog
-# if writer is not None:
-# ibest_writer = writer[f"{n}best_recog"]
-#
-# # Write the result to each file
-# ibest_writer["token"][key] = " ".join(token)
-# ibest_writer["token_int"][key] = " ".join(map(str, token_int))
-# ibest_writer["score"][key] = str(hyp.score)
-#
-# if text is not None:
-# text_postprocessed = postprocess_utils.sentence_postprocess(token)
-# item = {'key': key, 'value': text_postprocessed}
-# asr_result_list.append(item)
-# finish_count += 1
-# asr_utils.print_progress(finish_count / file_count)
-# if writer is not None:
-# ibest_writer["text"][key] = text
-# return asr_result_list
-
-def inference(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- batch_size=batch_size,
- beam_size=beam_size,
- ngpu=ngpu,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- penalty=penalty,
- log_level=log_level,
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- raw_inputs=raw_inputs,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- key_file=key_file,
- word_lm_train_config=word_lm_train_config,
- bpemodel=bpemodel,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- ngram_weight=ngram_weight,
- nbest=nbest,
- num_workers=num_workers,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-def inference_modelscope(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- streaming=streaming,
- )
- logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
- speech2text = Speech2Text(**speech2text_kwargs)
-
- def _forward(data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- fs=fs,
- mc=True,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
- collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- asr_result_list = []
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- else:
- writer = None
-
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- # N-best list of (text, token, token_int, hyp_object)
- try:
- results = speech2text(**batch)
- except TooShortUttError as e:
- logging.warning(f"Utterance {keys} {e}")
- hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", [""], [2], hyp]] * nbest
-
- # Only supporting batch_size==1
- key = keys[0]
- for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
- # Create a directory: outdir/{n}best_recog
- if writer is not None:
- ibest_writer = writer[f"{n}best_recog"]
-
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
-
- if text is not None:
- text_postprocessed = postprocess_utils.sentence_postprocess(token)
- item = {'key': key, 'value': text_postprocessed}
- asr_result_list.append(item)
- finish_count += 1
- asr_utils.print_progress(finish_count / file_count)
- if writer is not None:
- ibest_writer["text"][key] = text
- return asr_result_list
-
- return _forward
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="ASR Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--word_lm_train_config",
- type=str,
- help="Word LM training configuration",
- )
- group.add_argument(
- "--word_lm_file",
- type=str,
- help="Word LM parameter file",
- )
- group.add_argument(
- "--ngram_file",
- type=str,
- help="N-gram parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=20, help="Beam size")
- group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
- group.add_argument(
- "--maxlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain max output length. "
- "If maxlenratio=0.0 (default), it uses a end-detect "
- "function "
- "to automatically find maximum hypothesis lengths."
- "If maxlenratio<0.0, its absolute value is interpreted"
- "as a constant max output length",
- )
- group.add_argument(
- "--minlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain min output length",
- )
- group.add_argument(
- "--ctc_weight",
- type=float,
- default=0.5,
- help="CTC weight in joint decoding",
- )
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
- group.add_argument("--streaming", type=str2bool, default=False)
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
deleted file mode 100644
index ecdb62abc..000000000
--- a/funasr/bin/asr_inference_paraformer.py
+++ /dev/null
@@ -1,1027 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-import sys
-import time
-import copy
-import os
-import codecs
-import tempfile
-import requests
-from pathlib import Path
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-from typing import Any
-from typing import List
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
-from funasr.modules.beam_search.beam_search import Hypothesis
-from funasr.modules.scorers.ctc import CTCPrefixScorer
-from funasr.modules.scorers.length_bonus import LengthBonus
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.asr import ASRTaskParaformer as ASRTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend
-from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
-from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
-from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
-from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
-from funasr.bin.tp_inference import SpeechText2Timestamp
-from funasr.bin.vad_inference import Speech2VadSegment
-from funasr.bin.punctuation_infer import Text2Punc
-from funasr.utils.vad_utils import slice_padding_fbank
-from funasr.tasks.vad import VADTask
-from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
-
-class Speech2Text:
- """Speech2Text class
-
- Examples:
- >>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2text(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- maxlenratio: float = 0.0,
- minlenratio: float = 0.0,
- dtype: str = "float32",
- beam_size: int = 20,
- ctc_weight: float = 0.5,
- lm_weight: float = 1.0,
- ngram_weight: float = 0.9,
- penalty: float = 0.0,
- nbest: int = 1,
- frontend_conf: dict = None,
- hotword_list_or_file: str = None,
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build ASR model
- scorers = {}
- asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
- frontend = None
- if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
- frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
- logging.info("asr_model: {}".format(asr_model))
- logging.info("asr_train_args: {}".format(asr_train_args))
- asr_model.to(dtype=getattr(torch, dtype)).eval()
-
- if asr_model.ctc != None:
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
- scorers.update(
- ctc=ctc
- )
- token_list = asr_model.token_list
- scorers.update(
- length_bonus=LengthBonus(len(token_list)),
- )
-
- # 2. Build Language model
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, device
- )
- scorers["lm"] = lm.lm
-
- # 3. Build ngram model
- # ngram is not supported now
- ngram = None
- scorers["ngram"] = ngram
-
- # 4. Build BeamSearch object
- # transducer is not supported now
- beam_search_transducer = None
-
- weights = dict(
- decoder=1.0 - ctc_weight,
- ctc=ctc_weight,
- lm=lm_weight,
- ngram=ngram_weight,
- length_bonus=penalty,
- )
- beam_search = BeamSearch(
- beam_size=beam_size,
- weights=weights,
- scorers=scorers,
- sos=asr_model.sos,
- eos=asr_model.eos,
- vocab_size=len(token_list),
- token_list=token_list,
- pre_beam_score_key=None if ctc_weight == 1.0 else "full",
- )
-
- beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
- for scorer in scorers.values():
- if isinstance(scorer, torch.nn.Module):
- scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
-
- logging.info(f"Decoding device={device}, dtype={dtype}")
-
- # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.converter = converter
- self.tokenizer = tokenizer
-
- # 6. [Optional] Build hotword list from str, local file or url
- self.hotword_list = None
- self.hotword_list = self.generate_hotwords_list(hotword_list_or_file)
-
- is_use_lm = lm_weight != 0.0 and lm_file is not None
- if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
- beam_search = None
- self.beam_search = beam_search
- logging.info(f"Beam_search: {self.beam_search}")
- self.beam_search_transducer = beam_search_transducer
- self.maxlenratio = maxlenratio
- self.minlenratio = minlenratio
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
- self.frontend = frontend
- self.encoder_downsampling_factor = 1
- if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
- self.encoder_downsampling_factor = 4
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
- begin_time: int = 0, end_time: int = None,
- ):
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.asr_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
- lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
- batch = {"speech": feats, "speech_lengths": feats_len}
-
- # a. To device
- batch = to_device(batch, device=self.device)
-
- # b. Forward Encoder
- enc, enc_len = self.asr_model.encode(**batch)
- if isinstance(enc, tuple):
- enc = enc[0]
- # assert len(enc) == 1, len(enc)
- enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
-
- predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
- pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
- predictor_outs[2], predictor_outs[3]
- pre_token_length = pre_token_length.round().long()
- if torch.max(pre_token_length) < 1:
- return []
- if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model, NeatContextualParaformer):
- if self.hotword_list:
- logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
- decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
- decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
- else:
- decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
- decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
-
- if isinstance(self.asr_model, BiCifParaformer):
- _, _, us_alphas, us_peaks = self.asr_model.calc_predictor_timestamp(enc, enc_len,
- pre_token_length) # test no bias cif2
-
- results = []
- b, n, d = decoder_out.size()
- for i in range(b):
- x = enc[i, :enc_len[i], :]
- am_scores = decoder_out[i, :pre_token_length[i], :]
- if self.beam_search is not None:
- nbest_hyps = self.beam_search(
- x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
- )
-
- nbest_hyps = nbest_hyps[: self.nbest]
- else:
- yseq = am_scores.argmax(dim=-1)
- score = am_scores.max(dim=-1)[0]
- score = torch.sum(score, dim=-1)
- # pad with mask tokens to ensure compatibility with sos/eos tokens
- yseq = torch.tensor(
- [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
- )
- nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
-
- for hyp in nbest_hyps:
- assert isinstance(hyp, (Hypothesis)), type(hyp)
-
- # remove sos/eos and get results
- last_pos = -1
- if isinstance(hyp.yseq, list):
- token_int = hyp.yseq[1:last_pos]
- else:
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
-
- if self.tokenizer is not None:
- text = self.tokenizer.tokens2text(token)
- else:
- text = None
- timestamp = []
- if isinstance(self.asr_model, BiCifParaformer):
- _, timestamp = ts_prediction_lfr6_standard(us_alphas[i][:enc_len[i]*3],
- us_peaks[i][:enc_len[i]*3],
- copy.copy(token),
- vad_offset=begin_time)
- results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor))
-
-
- # assert check_return_type(results)
- return results
-
- def generate_hotwords_list(self, hotword_list_or_file):
- # for None
- if hotword_list_or_file is None:
- hotword_list = None
- # for local txt inputs
- elif os.path.exists(hotword_list_or_file) and hotword_list_or_file.endswith('.txt'):
- logging.info("Attempting to parse hotwords from local txt...")
- hotword_list = []
- hotword_str_list = []
- with codecs.open(hotword_list_or_file, 'r') as fin:
- for line in fin.readlines():
- hw = line.strip()
- hotword_str_list.append(hw)
- hotword_list.append(self.converter.tokens2ids([i for i in hw]))
- hotword_list.append([self.asr_model.sos])
- hotword_str_list.append('')
- logging.info("Initialized hotword list from file: {}, hotword list: {}."
- .format(hotword_list_or_file, hotword_str_list))
- # for url, download and generate txt
- elif hotword_list_or_file.startswith('http'):
- logging.info("Attempting to parse hotwords from url...")
- work_dir = tempfile.TemporaryDirectory().name
- if not os.path.exists(work_dir):
- os.makedirs(work_dir)
- text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
- local_file = requests.get(hotword_list_or_file)
- open(text_file_path, "wb").write(local_file.content)
- hotword_list_or_file = text_file_path
- hotword_list = []
- hotword_str_list = []
- with codecs.open(hotword_list_or_file, 'r') as fin:
- for line in fin.readlines():
- hw = line.strip()
- hotword_str_list.append(hw)
- hotword_list.append(self.converter.tokens2ids([i for i in hw]))
- hotword_list.append([self.asr_model.sos])
- hotword_str_list.append('')
- logging.info("Initialized hotword list from file: {}, hotword list: {}."
- .format(hotword_list_or_file, hotword_str_list))
- # for text str input
- elif not hotword_list_or_file.endswith('.txt'):
- logging.info("Attempting to parse hotwords as str...")
- hotword_list = []
- hotword_str_list = []
- for hw in hotword_list_or_file.strip().split():
- hotword_str_list.append(hw)
- hotword_list.append(self.converter.tokens2ids([i for i in hw]))
- hotword_list.append([self.asr_model.sos])
- hotword_str_list.append('')
- logging.info("Hotword list: {}.".format(hotword_str_list))
- else:
- hotword_list = None
- return hotword_list
-
-
-
-def inference_modelscope(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- output_dir: Optional[str] = None,
- timestamp_infer_config: Union[Path, str] = None,
- timestamp_model_file: Union[Path, str] = None,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- export_mode = False
- if param_dict is not None:
- hotword_list_or_file = param_dict.get('hotword')
- export_mode = param_dict.get("export_mode", False)
- else:
- hotword_list_or_file = None
-
- if kwargs.get("device", None) == "cpu":
- ngpu = 0
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- batch_size = 1
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- hotword_list_or_file=hotword_list_or_file,
- )
-
- speech2text = Speech2Text(**speech2text_kwargs)
-
- if timestamp_model_file is not None:
- speechtext2timestamp = SpeechText2Timestamp(
- timestamp_cmvn_file=cmvn_file,
- timestamp_model_file=timestamp_model_file,
- timestamp_infer_config=timestamp_infer_config,
- )
- else:
- speechtext2timestamp = None
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
-
- hotword_list_or_file = None
- if param_dict is not None:
- hotword_list_or_file = param_dict.get('hotword')
- if 'hotword' in kwargs and kwargs['hotword'] is not None:
- hotword_list_or_file = kwargs['hotword']
- if hotword_list_or_file is not None or 'hotword' in kwargs:
- speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
-
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- fs=fs,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
- collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- if param_dict is not None:
- use_timestamp = param_dict.get('use_timestamp', True)
- else:
- use_timestamp = True
-
- forward_time_total = 0.0
- length_total = 0.0
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- asr_result_list = []
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- else:
- writer = None
-
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
-
- logging.info("decoding, utt_id: {}".format(keys))
- # N-best list of (text, token, token_int, hyp_object)
-
- time_beg = time.time()
- results = speech2text(**batch)
- if len(results) < 1:
- hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
- time_end = time.time()
- forward_time = time_end - time_beg
- lfr_factor = results[0][-1]
- length = results[0][-2]
- forward_time_total += forward_time
- length_total += length
- rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time, 100 * forward_time / (length * lfr_factor))
- logging.info(rtf_cur)
-
- for batch_id in range(_bs):
- result = [results[batch_id][:-2]]
-
- key = keys[batch_id]
- for n, result in zip(range(1, nbest + 1), result):
- text, token, token_int, hyp = result[0], result[1], result[2], result[3]
- timestamp = result[4] if len(result[4]) > 0 else None
- # conduct timestamp prediction here
- # timestamp inference requires token length
- # thus following inference cannot be conducted in batch
- if timestamp is None and speechtext2timestamp:
- ts_batch = {}
- ts_batch['speech'] = batch['speech'][batch_id].unsqueeze(0)
- ts_batch['speech_lengths'] = torch.tensor([batch['speech_lengths'][batch_id]])
- ts_batch['text_lengths'] = torch.tensor([len(token)])
- us_alphas, us_peaks = speechtext2timestamp(**ts_batch)
- ts_str, timestamp = ts_prediction_lfr6_standard(us_alphas[0], us_peaks[0], token, force_time_shift=-3.0)
- # Create a directory: outdir/{n}best_recog
- if writer is not None:
- ibest_writer = writer[f"{n}best_recog"]
-
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
- ibest_writer["rtf"][key] = rtf_cur
-
- if text is not None:
- if use_timestamp and timestamp is not None:
- postprocessed_result = postprocess_utils.sentence_postprocess(token, timestamp)
- else:
- postprocessed_result = postprocess_utils.sentence_postprocess(token)
- timestamp_postprocessed = ""
- if len(postprocessed_result) == 3:
- text_postprocessed, timestamp_postprocessed, word_lists = postprocessed_result[0], \
- postprocessed_result[1], \
- postprocessed_result[2]
- else:
- text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
- item = {'key': key, 'value': text_postprocessed}
- if timestamp_postprocessed != "":
- item['timestamp'] = timestamp_postprocessed
- asr_result_list.append(item)
- finish_count += 1
- # asr_utils.print_progress(finish_count / file_count)
- if writer is not None:
- ibest_writer["text"][key] = " ".join(word_lists)
-
- logging.info("decoding, utt: {}, predictions: {}".format(key, text))
- rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
- logging.info(rtf_avg)
- if writer is not None:
- ibest_writer["rtf"]["rtf_avf"] = rtf_avg
- return asr_result_list
-
- return _forward
-
-
-def inference_modelscope_vad_punc(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- vad_infer_config: Optional[str] = None,
- vad_model_file: Optional[str] = None,
- vad_cmvn_file: Optional[str] = None,
- time_stamp_writer: bool = True,
- punc_infer_config: Optional[str] = None,
- punc_model_file: Optional[str] = None,
- outputs_dict: Optional[bool] = True,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if param_dict is not None:
- hotword_list_or_file = param_dict.get('hotword')
- else:
- hotword_list_or_file = None
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2vadsegment
- speech2vadsegment_kwargs = dict(
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- device=device,
- dtype=dtype,
- )
- # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
- speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
-
- # 3. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- hotword_list_or_file=hotword_list_or_file,
- )
- speech2text = Speech2Text(**speech2text_kwargs)
- text2punc = None
- if punc_model_file is not None:
- text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype)
-
- if output_dir is not None:
- writer = DatadirWriter(output_dir)
- ibest_writer = writer[f"1best_recog"]
- ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
-
- def _forward(data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
-
- hotword_list_or_file = None
- if param_dict is not None:
- hotword_list_or_file = param_dict.get('hotword')
-
- if 'hotword' in kwargs:
- hotword_list_or_file = kwargs['hotword']
-
- if speech2text.hotword_list is None:
- speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
-
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- fs=fs,
- batch_size=1,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
- collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- if param_dict is not None:
- use_timestamp = param_dict.get('use_timestamp', True)
- else:
- use_timestamp = True
-
- finish_count = 0
- file_count = 1
- lfr_factor = 6
- # 7 .Start for-loop
- asr_result_list = []
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- writer = None
- if output_path is not None:
- writer = DatadirWriter(output_path)
- ibest_writer = writer[f"1best_recog"]
-
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-
- vad_results = speech2vadsegment(**batch)
- _, vadsegments = vad_results[0], vad_results[1][0]
-
- speech, speech_lengths = batch["speech"], batch["speech_lengths"]
-
- n = len(vadsegments)
- data_with_index = [(vadsegments[i], i) for i in range(n)]
- sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0])
- results_sorted = []
- for j, beg_idx in enumerate(range(0, n, batch_size)):
- end_idx = min(n, beg_idx + batch_size)
- speech_j, speech_lengths_j = slice_padding_fbank(speech, speech_lengths, sorted_data[beg_idx:end_idx])
-
- batch = {"speech": speech_j, "speech_lengths": speech_lengths_j}
- batch = to_device(batch, device=device)
- results = speech2text(**batch)
-
- if len(results) < 1:
- results = [["", [], [], [], [], [], []]]
- results_sorted.extend(results)
- restored_data = [0] * n
- for j in range(n):
- index = sorted_data[j][1]
- restored_data[index] = results_sorted[j]
- result = ["", [], [], [], [], [], []]
- for j in range(n):
- result[0] += restored_data[j][0]
- result[1] += restored_data[j][1]
- result[2] += restored_data[j][2]
- if len(restored_data[j][4]) > 0:
- for t in restored_data[j][4]:
- t[0] += vadsegments[j][0]
- t[1] += vadsegments[j][0]
- result[4] += restored_data[j][4]
- # result = [result[k]+restored_data[j][k] for k in range(len(result[:-2]))]
-
- key = keys[0]
- # result = result_segments[0]
- text, token, token_int = result[0], result[1], result[2]
- time_stamp = result[4] if len(result[4]) > 0 else None
-
- if use_timestamp and time_stamp is not None:
- postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
- else:
- postprocessed_result = postprocess_utils.sentence_postprocess(token)
- text_postprocessed = ""
- time_stamp_postprocessed = ""
- text_postprocessed_punc = postprocessed_result
- if len(postprocessed_result) == 3:
- text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
- postprocessed_result[1], \
- postprocessed_result[2]
- else:
- text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
-
- text_postprocessed_punc = text_postprocessed
- punc_id_list = []
- if len(word_lists) > 0 and text2punc is not None:
- text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
-
- item = {'key': key, 'value': text_postprocessed_punc}
- if text_postprocessed != "":
- item['text_postprocessed'] = text_postprocessed
- if time_stamp_postprocessed != "":
- item['time_stamp'] = time_stamp_postprocessed
-
- item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed)
-
- asr_result_list.append(item)
- finish_count += 1
- # asr_utils.print_progress(finish_count / file_count)
- if writer is not None:
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["vad"][key] = "{}".format(vadsegments)
- ibest_writer["text"][key] = " ".join(word_lists)
- ibest_writer["text_with_punc"][key] = text_postprocessed_punc
- if time_stamp_postprocessed is not None:
- ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
-
- logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
- return asr_result_list
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="ASR Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
- parser.add_argument(
- "--hotword",
- type=str_or_none,
- default=None,
- help="hotword file path or hotwords seperated by space"
- )
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--word_lm_train_config",
- type=str,
- help="Word LM training configuration",
- )
- group.add_argument(
- "--word_lm_file",
- type=str,
- help="Word LM parameter file",
- )
- group.add_argument(
- "--ngram_file",
- type=str,
- help="N-gram parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=20, help="Beam size")
- group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
- group.add_argument(
- "--maxlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain max output length. "
- "If maxlenratio=0.0 (default), it uses a end-detect "
- "function "
- "to automatically find maximum hypothesis lengths."
- "If maxlenratio<0.0, its absolute value is interpreted"
- "as a constant max output length",
- )
- group.add_argument(
- "--minlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain min output length",
- )
- group.add_argument(
- "--ctc_weight",
- type=float,
- default=0.5,
- help="CTC weight in joint decoding",
- )
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
- group.add_argument("--streaming", type=str2bool, default=False)
-
- group.add_argument(
- "--frontend_conf",
- default=None,
- help="",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- param_dict = {'hotword': args.hotword}
- kwargs = vars(args)
- kwargs.pop("config", None)
- kwargs['param_dict'] = param_dict
- inference_pipeline = inference_modelscope(**kwargs)
- return inference_pipeline(kwargs["data_path_and_name_and_type"], param_dict=param_dict)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
deleted file mode 100644
index 4f04d02e3..000000000
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ /dev/null
@@ -1,749 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-import sys
-import time
-import copy
-import os
-import codecs
-import tempfile
-import requests
-import yaml
-from pathlib import Path
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-from typing import Any
-from typing import List
-
-import numpy as np
-import torch
-import torchaudio
-from typeguard import check_argument_types
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
-from funasr.modules.beam_search.beam_search import Hypothesis
-from funasr.modules.scorers.ctc import CTCPrefixScorer
-from funasr.modules.scorers.length_bonus import LengthBonus
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.asr import ASRTaskParaformer as ASRTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
-from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
-
-np.set_printoptions(threshold=np.inf)
-
-
-class Speech2Text:
- """Speech2Text class
-
- Examples:
- >>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2text(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- maxlenratio: float = 0.0,
- minlenratio: float = 0.0,
- dtype: str = "float32",
- beam_size: int = 20,
- ctc_weight: float = 0.5,
- lm_weight: float = 1.0,
- ngram_weight: float = 0.9,
- penalty: float = 0.0,
- nbest: int = 1,
- frontend_conf: dict = None,
- hotword_list_or_file: str = None,
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build ASR model
- scorers = {}
- asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
- frontend = None
- if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
- frontend = WavFrontendOnline(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
- logging.info("asr_model: {}".format(asr_model))
- logging.info("asr_train_args: {}".format(asr_train_args))
- asr_model.to(dtype=getattr(torch, dtype)).eval()
-
- if asr_model.ctc != None:
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
- scorers.update(
- ctc=ctc
- )
- token_list = asr_model.token_list
- scorers.update(
- length_bonus=LengthBonus(len(token_list)),
- )
-
- # 2. Build Language model
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, device
- )
- scorers["lm"] = lm.lm
-
- # 3. Build ngram model
- # ngram is not supported now
- ngram = None
- scorers["ngram"] = ngram
-
- # 4. Build BeamSearch object
- # transducer is not supported now
- beam_search_transducer = None
-
- weights = dict(
- decoder=1.0 - ctc_weight,
- ctc=ctc_weight,
- lm=lm_weight,
- ngram=ngram_weight,
- length_bonus=penalty,
- )
- beam_search = BeamSearch(
- beam_size=beam_size,
- weights=weights,
- scorers=scorers,
- sos=asr_model.sos,
- eos=asr_model.eos,
- vocab_size=len(token_list),
- token_list=token_list,
- pre_beam_score_key=None if ctc_weight == 1.0 else "full",
- )
-
- beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
- for scorer in scorers.values():
- if isinstance(scorer, torch.nn.Module):
- scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
-
- logging.info(f"Decoding device={device}, dtype={dtype}")
-
- # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.converter = converter
- self.tokenizer = tokenizer
-
- # 6. [Optional] Build hotword list from str, local file or url
-
- is_use_lm = lm_weight != 0.0 and lm_file is not None
- if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
- beam_search = None
- self.beam_search = beam_search
- logging.info(f"Beam_search: {self.beam_search}")
- self.beam_search_transducer = beam_search_transducer
- self.maxlenratio = maxlenratio
- self.minlenratio = minlenratio
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
- self.frontend = frontend
- self.encoder_downsampling_factor = 1
- if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
- self.encoder_downsampling_factor = 4
-
- @torch.no_grad()
- def __call__(
- self, cache: dict, speech: Union[torch.Tensor], speech_lengths: Union[torch.Tensor] = None
- ):
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
- results = []
- cache_en = cache["encoder"]
- if speech.shape[1] < 16 * 60 and cache_en["is_final"]:
- if cache_en["start_idx"] == 0:
- return []
- cache_en["tail_chunk"] = True
- feats = cache_en["feats"]
- feats_len = torch.tensor([feats.shape[1]])
- self.asr_model.frontend = None
- results = self.infer(feats, feats_len, cache)
- return results
- else:
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths, cache_en["is_final"])
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.asr_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
-
- if feats.shape[1] != 0:
- if cache_en["is_final"]:
- if feats.shape[1] + cache_en["chunk_size"][2] < cache_en["chunk_size"][1]:
- cache_en["last_chunk"] = True
- else:
- # first chunk
- feats_chunk1 = feats[:, :cache_en["chunk_size"][1], :]
- feats_len = torch.tensor([feats_chunk1.shape[1]])
- results_chunk1 = self.infer(feats_chunk1, feats_len, cache)
-
- # last chunk
- cache_en["last_chunk"] = True
- feats_chunk2 = feats[:, -(feats.shape[1] + cache_en["chunk_size"][2] - cache_en["chunk_size"][1]):, :]
- feats_len = torch.tensor([feats_chunk2.shape[1]])
- results_chunk2 = self.infer(feats_chunk2, feats_len, cache)
-
- return [" ".join(results_chunk1 + results_chunk2)]
-
- results = self.infer(feats, feats_len, cache)
-
- return results
-
- @torch.no_grad()
- def infer(self, feats: Union[torch.Tensor], feats_len: Union[torch.Tensor], cache: List = None):
- batch = {"speech": feats, "speech_lengths": feats_len}
- batch = to_device(batch, device=self.device)
- # b. Forward Encoder
- enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache=cache)
- if isinstance(enc, tuple):
- enc = enc[0]
- # assert len(enc) == 1, len(enc)
- enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
-
- predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
- pre_acoustic_embeds, pre_token_length= predictor_outs[0], predictor_outs[1]
- if torch.max(pre_token_length) < 1:
- return []
- decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
- decoder_out = decoder_outs
-
- results = []
- b, n, d = decoder_out.size()
- for i in range(b):
- x = enc[i, :enc_len[i], :]
- am_scores = decoder_out[i, :pre_token_length[i], :]
- if self.beam_search is not None:
- nbest_hyps = self.beam_search(
- x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
- )
-
- nbest_hyps = nbest_hyps[: self.nbest]
- else:
- yseq = am_scores.argmax(dim=-1)
- score = am_scores.max(dim=-1)[0]
- score = torch.sum(score, dim=-1)
- # pad with mask tokens to ensure compatibility with sos/eos tokens
- yseq = torch.tensor(
- [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
- )
- nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
-
- for hyp in nbest_hyps:
- assert isinstance(hyp, (Hypothesis)), type(hyp)
-
- # remove sos/eos and get results
- last_pos = -1
- if isinstance(hyp.yseq, list):
- token_int = hyp.yseq[1:last_pos]
- else:
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
- token = " ".join(token)
-
- results.append(token)
-
- # assert check_return_type(results)
- return results
-
-
-def inference(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
-
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- batch_size=batch_size,
- beam_size=beam_size,
- ngpu=ngpu,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- penalty=penalty,
- log_level=log_level,
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- raw_inputs=raw_inputs,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- key_file=key_file,
- word_lm_train_config=word_lm_train_config,
- bpemodel=bpemodel,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- ngram_weight=ngram_weight,
- nbest=nbest,
- num_workers=num_workers,
-
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-
-def inference_modelscope(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- output_dir: Optional[str] = None,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
-
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- export_mode = False
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- batch_size = 1
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- )
-
- speech2text = Speech2Text(**speech2text_kwargs)
-
- def _load_bytes(input):
- middle_data = np.frombuffer(input, dtype=np.int16)
- middle_data = np.asarray(middle_data)
- if middle_data.dtype.kind not in 'iu':
- raise TypeError("'middle_data' must be an array of integers")
- dtype = np.dtype('float32')
- if dtype.kind != 'f':
- raise TypeError("'dtype' must be a floating point type")
-
- i = np.iinfo(middle_data.dtype)
- abs_max = 2 ** (i.bits - 1)
- offset = i.min + abs_max
- array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
- return array
-
- def _read_yaml(yaml_path: Union[str, Path]) -> Dict:
- if not Path(yaml_path).exists():
- raise FileExistsError(f'The {yaml_path} does not exist.')
-
- with open(str(yaml_path), 'rb') as f:
- data = yaml.load(f, Loader=yaml.Loader)
- return data
-
- def _prepare_cache(cache: dict = {}, chunk_size=[5,10,5], batch_size=1):
- if len(cache) > 0:
- return cache
- config = _read_yaml(asr_train_config)
- enc_output_size = config["encoder_conf"]["output_size"]
- feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
- cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
- "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
- "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
- cache["encoder"] = cache_en
-
- cache_de = {"decode_fsmn": None}
- cache["decoder"] = cache_de
-
- return cache
-
- def _cache_reset(cache: dict = {}, chunk_size=[5,10,5], batch_size=1):
- if len(cache) > 0:
- config = _read_yaml(asr_train_config)
- enc_output_size = config["encoder_conf"]["output_size"]
- feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
- cache_en = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
- "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
- "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
- cache["encoder"] = cache_en
-
- cache_de = {"decode_fsmn": None}
- cache["decoder"] = cache_de
-
- return cache
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
-
- # 3. Build data-iterator
- if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
- raw_inputs = _load_bytes(data_path_and_name_and_type[0])
- raw_inputs = torch.tensor(raw_inputs)
- if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
- raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, np.ndarray):
- raw_inputs = torch.tensor(raw_inputs)
- is_final = False
- cache = {}
- chunk_size = [5, 10, 5]
- if param_dict is not None and "cache" in param_dict:
- cache = param_dict["cache"]
- if param_dict is not None and "is_final" in param_dict:
- is_final = param_dict["is_final"]
- if param_dict is not None and "chunk_size" in param_dict:
- chunk_size = param_dict["chunk_size"]
-
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- raw_inputs = torch.unsqueeze(raw_inputs, axis=0)
- asr_result_list = []
- cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
- item = {}
- if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
- sample_offset = 0
- speech_length = raw_inputs.shape[1]
- stride_size = chunk_size[1] * 960
- cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
- final_result = ""
- for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
- if sample_offset + stride_size >= speech_length - 1:
- stride_size = speech_length - sample_offset
- cache["encoder"]["is_final"] = True
- else:
- cache["encoder"]["is_final"] = False
- input_lens = torch.tensor([stride_size])
- asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size], input_lens)
- if len(asr_result) != 0:
- final_result += " ".join(asr_result) + " "
- item = {'key': "utt", 'value': final_result.strip()}
- else:
- input_lens = torch.tensor([raw_inputs.shape[1]])
- cache["encoder"]["is_final"] = is_final
- asr_result = speech2text(cache, raw_inputs, input_lens)
- item = {'key': "utt", 'value': " ".join(asr_result)}
-
- asr_result_list.append(item)
- if is_final:
- cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1)
- return asr_result_list
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="ASR Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
- parser.add_argument(
- "--hotword",
- type=str_or_none,
- default=None,
- help="hotword file path or hotwords seperated by space"
- )
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--word_lm_train_config",
- type=str,
- help="Word LM training configuration",
- )
- group.add_argument(
- "--word_lm_file",
- type=str,
- help="Word LM parameter file",
- )
- group.add_argument(
- "--ngram_file",
- type=str,
- help="N-gram parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=20, help="Beam size")
- group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
- group.add_argument(
- "--maxlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain max output length. "
- "If maxlenratio=0.0 (default), it uses a end-detect "
- "function "
- "to automatically find maximum hypothesis lengths."
- "If maxlenratio<0.0, its absolute value is interpreted"
- "as a constant max output length",
- )
- group.add_argument(
- "--minlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain min output length",
- )
- group.add_argument(
- "--ctc_weight",
- type=float,
- default=0.5,
- help="CTC weight in joint decoding",
- )
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
- group.add_argument("--streaming", type=str2bool, default=False)
-
- group.add_argument(
- "--frontend_conf",
- default=None,
- help="",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- param_dict = {'hotword': args.hotword}
- kwargs = vars(args)
- kwargs.pop("config", None)
- kwargs['param_dict'] = param_dict
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
deleted file mode 100644
index bd36907f7..000000000
--- a/funasr/bin/asr_inference_rnnt.py
+++ /dev/null
@@ -1,734 +0,0 @@
-#!/usr/bin/env python3
-
-""" Inference class definition for Transducer models."""
-
-from __future__ import annotations
-
-import argparse
-import logging
-import math
-import sys
-from pathlib import Path
-from typing import Any, Dict, List, Optional, Sequence, Tuple, Union
-
-import numpy as np
-import torch
-from packaging.version import parse as V
-from typeguard import check_argument_types, check_return_type
-
-from funasr.modules.beam_search.beam_search_transducer import (
- BeamSearchTransducer,
- Hypothesis,
-)
-from funasr.modules.nets_utils import TooShortUttError
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.tasks.asr import ASRTransducerTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.types import str2bool, str2triple_str, str_or_none
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.models.frontend.wav_frontend import WavFrontend
-
-class Speech2Text:
- """Speech2Text class for Transducer models.
- Args:
- asr_train_config: ASR model training config path.
- asr_model_file: ASR model path.
- beam_search_config: Beam search config path.
- lm_train_config: Language Model training config path.
- lm_file: Language Model config path.
- token_type: Type of token units.
- bpemodel: BPE model path.
- device: Device to use for inference.
- beam_size: Size of beam during search.
- dtype: Data type.
- lm_weight: Language model weight.
- quantize_asr_model: Whether to apply dynamic quantization to ASR model.
- quantize_modules: List of module names to apply dynamic quantization on.
- quantize_dtype: Dynamic quantization data type.
- nbest: Number of final hypothesis.
- streaming: Whether to perform chunk-by-chunk inference.
- chunk_size: Number of frames in chunk AFTER subsampling.
- left_context: Number of frames in left context AFTER subsampling.
- right_context: Number of frames in right context AFTER subsampling.
- display_partial_hypotheses: Whether to display partial hypotheses.
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- beam_search_config: Dict[str, Any] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- beam_size: int = 5,
- dtype: str = "float32",
- lm_weight: float = 1.0,
- quantize_asr_model: bool = False,
- quantize_modules: List[str] = None,
- quantize_dtype: str = "qint8",
- nbest: int = 1,
- streaming: bool = False,
- simu_streaming: bool = False,
- chunk_size: int = 16,
- left_context: int = 32,
- right_context: int = 0,
- display_partial_hypotheses: bool = False,
- ) -> None:
- """Construct a Speech2Text object."""
- super().__init__()
-
- assert check_argument_types()
- asr_model, asr_train_args = ASRTransducerTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
-
- frontend = None
- if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
- frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
- if quantize_asr_model:
- if quantize_modules is not None:
- if not all([q in ["LSTM", "Linear"] for q in quantize_modules]):
- raise ValueError(
- "Only 'Linear' and 'LSTM' modules are currently supported"
- " by PyTorch and in --quantize_modules"
- )
-
- q_config = set([getattr(torch.nn, q) for q in quantize_modules])
- else:
- q_config = {torch.nn.Linear}
-
- if quantize_dtype == "float16" and (V(torch.__version__) < V("1.5.0")):
- raise ValueError(
- "float16 dtype for dynamic quantization is not supported with torch"
- " version < 1.5.0. Switching to qint8 dtype instead."
- )
- q_dtype = getattr(torch, quantize_dtype)
-
- asr_model = torch.quantization.quantize_dynamic(
- asr_model, q_config, dtype=q_dtype
- ).eval()
- else:
- asr_model.to(dtype=getattr(torch, dtype)).eval()
-
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, device
- )
- lm_scorer = lm.lm
- else:
- lm_scorer = None
-
- # 4. Build BeamSearch object
- if beam_search_config is None:
- beam_search_config = {}
-
- beam_search = BeamSearchTransducer(
- asr_model.decoder,
- asr_model.joint_network,
- beam_size,
- lm=lm_scorer,
- lm_weight=lm_weight,
- nbest=nbest,
- **beam_search_config,
- )
-
- token_list = asr_model.token_list
-
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
-
- self.converter = converter
- self.tokenizer = tokenizer
-
- self.beam_search = beam_search
- self.streaming = streaming
- self.simu_streaming = simu_streaming
- self.chunk_size = max(chunk_size, 0)
- self.left_context = left_context
- self.right_context = max(right_context, 0)
-
- if not streaming or chunk_size == 0:
- self.streaming = False
- self.asr_model.encoder.dynamic_chunk_training = False
-
- if not simu_streaming or chunk_size == 0:
- self.simu_streaming = False
- self.asr_model.encoder.dynamic_chunk_training = False
-
- self.frontend = frontend
- self.window_size = self.chunk_size + self.right_context
-
- if self.streaming:
- self._ctx = self.asr_model.encoder.get_encoder_input_size(
- self.window_size
- )
-
- self.last_chunk_length = (
- self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
- )
- self.reset_inference_cache()
-
- def reset_inference_cache(self) -> None:
- """Reset Speech2Text parameters."""
- self.frontend_cache = None
-
- self.asr_model.encoder.reset_streaming_cache(
- self.left_context, device=self.device
- )
- self.beam_search.reset_inference_cache()
-
- self.num_processed_frames = torch.tensor([[0]], device=self.device)
-
- @torch.no_grad()
- def streaming_decode(
- self,
- speech: Union[torch.Tensor, np.ndarray],
- is_final: bool = True,
- ) -> List[Hypothesis]:
- """Speech2Text streaming call.
- Args:
- speech: Chunk of speech data. (S)
- is_final: Whether speech corresponds to the final chunk of data.
- Returns:
- nbest_hypothesis: N-best hypothesis.
- """
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- if is_final:
- if self.streaming and speech.size(0) < self.last_chunk_length:
- pad = torch.zeros(
- self.last_chunk_length - speech.size(0), speech.size(1), dtype=speech.dtype
- )
- speech = torch.cat([speech, pad], dim=0) #feats, feats_length = self.apply_frontend(speech, is_final=is_final)
-
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
-
- if self.asr_model.normalize is not None:
- feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
-
- feats = to_device(feats, device=self.device)
- feats_lengths = to_device(feats_lengths, device=self.device)
- enc_out = self.asr_model.encoder.chunk_forward(
- feats,
- feats_lengths,
- self.num_processed_frames,
- chunk_size=self.chunk_size,
- left_context=self.left_context,
- right_context=self.right_context,
- )
- nbest_hyps = self.beam_search(enc_out[0], is_final=is_final)
-
- self.num_processed_frames += self.chunk_size
-
- if is_final:
- self.reset_inference_cache()
-
- return nbest_hyps
-
- @torch.no_grad()
- def simu_streaming_decode(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]:
- """Speech2Text call.
- Args:
- speech: Speech data. (S)
- Returns:
- nbest_hypothesis: N-best hypothesis.
- """
- assert check_argument_types()
-
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
-
- if self.asr_model.normalize is not None:
- feats, feats_lengths = self.asr_model.normalize(feats, feats_lengths)
-
- feats = to_device(feats, device=self.device)
- feats_lengths = to_device(feats_lengths, device=self.device)
- enc_out = self.asr_model.encoder.simu_chunk_forward(feats, feats_lengths, self.chunk_size, self.left_context, self.right_context)
- nbest_hyps = self.beam_search(enc_out[0])
-
- return nbest_hyps
-
- @torch.no_grad()
- def __call__(self, speech: Union[torch.Tensor, np.ndarray]) -> List[Hypothesis]:
- """Speech2Text call.
- Args:
- speech: Speech data. (S)
- Returns:
- nbest_hypothesis: N-best hypothesis.
- """
- assert check_argument_types()
-
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- feats = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- feats_lengths = feats.new_full([1], dtype=torch.long, fill_value=feats.size(1))
-
- feats = to_device(feats, device=self.device)
- feats_lengths = to_device(feats_lengths, device=self.device)
-
- enc_out, _ = self.asr_model.encoder(feats, feats_lengths)
-
- nbest_hyps = self.beam_search(enc_out[0])
-
- return nbest_hyps
-
- def hypotheses_to_results(self, nbest_hyps: List[Hypothesis]) -> List[Any]:
- """Build partial or final results from the hypotheses.
- Args:
- nbest_hyps: N-best hypothesis.
- Returns:
- results: Results containing different representation for the hypothesis.
- """
- results = []
-
- for hyp in nbest_hyps:
- token_int = list(filter(lambda x: x != 0, hyp.yseq))
-
- token = self.converter.ids2tokens(token_int)
-
- if self.tokenizer is not None:
- text = self.tokenizer.tokens2text(token)
- else:
- text = None
- results.append((text, token, token_int, hyp))
-
- assert check_return_type(results)
-
- return results
-
- @staticmethod
- def from_pretrained(
- model_tag: Optional[str] = None,
- **kwargs: Optional[Any],
- ) -> Speech2Text:
- """Build Speech2Text instance from the pretrained model.
- Args:
- model_tag: Model tag of the pretrained models.
- Return:
- : Speech2Text instance.
- """
- if model_tag is not None:
- try:
- from espnet_model_zoo.downloader import ModelDownloader
-
- except ImportError:
- logging.error(
- "`espnet_model_zoo` is not installed. "
- "Please install via `pip install -U espnet_model_zoo`."
- )
- raise
- d = ModelDownloader()
- kwargs.update(**d.download_and_unpack(model_tag))
-
- return Speech2Text(**kwargs)
-
-
-def inference(
- output_dir: str,
- batch_size: int,
- dtype: str,
- beam_size: int,
- ngpu: int,
- seed: int,
- lm_weight: float,
- nbest: int,
- num_workers: int,
- log_level: Union[int, str],
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- cmvn_file: Optional[str],
- beam_search_config: Optional[dict],
- lm_train_config: Optional[str],
- lm_file: Optional[str],
- model_tag: Optional[str],
- token_type: Optional[str],
- bpemodel: Optional[str],
- key_file: Optional[str],
- allow_variable_data_keys: bool,
- quantize_asr_model: Optional[bool],
- quantize_modules: Optional[List[str]],
- quantize_dtype: Optional[str],
- streaming: Optional[bool],
- simu_streaming: Optional[bool],
- chunk_size: Optional[int],
- left_context: Optional[int],
- right_context: Optional[int],
- display_partial_hypotheses: bool,
- **kwargs,
-) -> None:
- """Transducer model inference.
- Args:
- output_dir: Output directory path.
- batch_size: Batch decoding size.
- dtype: Data type.
- beam_size: Beam size.
- ngpu: Number of GPUs.
- seed: Random number generator seed.
- lm_weight: Weight of language model.
- nbest: Number of final hypothesis.
- num_workers: Number of workers.
- log_level: Level of verbose for logs.
- data_path_and_name_and_type:
- asr_train_config: ASR model training config path.
- asr_model_file: ASR model path.
- beam_search_config: Beam search config path.
- lm_train_config: Language Model training config path.
- lm_file: Language Model path.
- model_tag: Model tag.
- token_type: Type of token units.
- bpemodel: BPE model path.
- key_file: File key.
- allow_variable_data_keys: Whether to allow variable data keys.
- quantize_asr_model: Whether to apply dynamic quantization to ASR model.
- quantize_modules: List of module names to apply dynamic quantization on.
- quantize_dtype: Dynamic quantization data type.
- streaming: Whether to perform chunk-by-chunk inference.
- chunk_size: Number of frames in chunk AFTER subsampling.
- left_context: Number of frames in left context AFTER subsampling.
- right_context: Number of frames in right context AFTER subsampling.
- display_partial_hypotheses: Whether to display partial hypotheses.
- """
- assert check_argument_types()
-
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1:
- device = "cuda"
- else:
- device = "cpu"
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- beam_search_config=beam_search_config,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- dtype=dtype,
- beam_size=beam_size,
- lm_weight=lm_weight,
- nbest=nbest,
- quantize_asr_model=quantize_asr_model,
- quantize_modules=quantize_modules,
- quantize_dtype=quantize_dtype,
- streaming=streaming,
- simu_streaming=simu_streaming,
- chunk_size=chunk_size,
- left_context=left_context,
- right_context=right_context,
- )
- speech2text = Speech2Text.from_pretrained(
- model_tag=model_tag,
- **speech2text_kwargs,
- )
-
- # 3. Build data-iterator
- loader = ASRTransducerTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTransducerTask.build_preprocess_fn(
- speech2text.asr_train_args, False
- ),
- collate_fn=ASRTransducerTask.build_collate_fn(
- speech2text.asr_train_args, False
- ),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- # 4 .Start for-loop
- with DatadirWriter(output_dir) as writer:
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
-
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
- assert len(batch.keys()) == 1
-
- try:
- if speech2text.streaming:
- speech = batch["speech"]
-
- _steps = len(speech) // speech2text._ctx
- _end = 0
- for i in range(_steps):
- _end = (i + 1) * speech2text._ctx
-
- speech2text.streaming_decode(
- speech[i * speech2text._ctx : _end], is_final=False
- )
-
- final_hyps = speech2text.streaming_decode(
- speech[_end : len(speech)], is_final=True
- )
- elif speech2text.simu_streaming:
- final_hyps = speech2text.simu_streaming_decode(**batch)
- else:
- final_hyps = speech2text(**batch)
-
- results = speech2text.hypotheses_to_results(final_hyps)
- except TooShortUttError as e:
- logging.warning(f"Utterance {keys} {e}")
- hyp = Hypothesis(score=0.0, yseq=[], dec_state=None)
- results = [[" ", [""], [2], hyp]] * nbest
-
- key = keys[0]
- for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
- ibest_writer = writer[f"{n}best_recog"]
-
- ibest_writer["token"][key] = " ".join(token)
- ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
-
- if text is not None:
- ibest_writer["text"][key] = text
-
-
-def get_parser():
- """Get Transducer model inference parser."""
-
- parser = config_argparse.ArgumentParser(
- description="ASR Transducer Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=True,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=5, help="Beam size")
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument(
- "--beam_search_config",
- default={},
- help="The keyword arguments for transducer beam search.",
- )
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
-
- group = parser.add_argument_group("Dynamic quantization related")
- parser.add_argument(
- "--quantize_asr_model",
- type=bool,
- default=False,
- help="Apply dynamic quantization to ASR model.",
- )
- parser.add_argument(
- "--quantize_modules",
- nargs="*",
- default=None,
- help="""Module names to apply dynamic quantization on.
- The module names are provided as a list, where each name is separated
- by a comma (e.g.: --quantize-config=[Linear,LSTM,GRU]).
- Each specified name should be an attribute of 'torch.nn', e.g.:
- torch.nn.Linear, torch.nn.LSTM, torch.nn.GRU, ...""",
- )
- parser.add_argument(
- "--quantize_dtype",
- type=str,
- default="qint8",
- choices=["float16", "qint8"],
- help="Dtype for dynamic quantization.",
- )
-
- group = parser.add_argument_group("Streaming related")
- parser.add_argument(
- "--streaming",
- type=bool,
- default=False,
- help="Whether to perform chunk-by-chunk inference.",
- )
- parser.add_argument(
- "--simu_streaming",
- type=bool,
- default=False,
- help="Whether to simulate chunk-by-chunk inference.",
- )
- parser.add_argument(
- "--chunk_size",
- type=int,
- default=16,
- help="Number of frames in chunk AFTER subsampling.",
- )
- parser.add_argument(
- "--left_context",
- type=int,
- default=32,
- help="Number of frames in left context of the chunk AFTER subsampling.",
- )
- parser.add_argument(
- "--right_context",
- type=int,
- default=0,
- help="Number of frames in right context of the chunk AFTER subsampling.",
- )
- parser.add_argument(
- "--display_partial_hypotheses",
- type=bool,
- default=False,
- help="Whether to display partial hypotheses during chunk-by-chunk inference.",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
-
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
-
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
deleted file mode 100644
index 35ecdc24b..000000000
--- a/funasr/bin/asr_inference_uniasr.py
+++ /dev/null
@@ -1,694 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-import sys
-from pathlib import Path
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-from typing import Any
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.beam_search.beam_search import BeamSearchScama as BeamSearch
-from funasr.modules.beam_search.beam_search import Hypothesis
-from funasr.modules.scorers.ctc import CTCPrefixScorer
-from funasr.modules.scorers.length_bonus import LengthBonus
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.asr import ASRTaskUniASR as ASRTask
-from funasr.tasks.lm import LMTask
-from funasr.text.build_tokenizer import build_tokenizer
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend
-
-
-
-class Speech2Text:
- """Speech2Text class
-
- Examples:
- >>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2text(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- asr_train_config: Union[Path, str] = None,
- asr_model_file: Union[Path, str] = None,
- cmvn_file: Union[Path, str] = None,
- lm_train_config: Union[Path, str] = None,
- lm_file: Union[Path, str] = None,
- token_type: str = None,
- bpemodel: str = None,
- device: str = "cpu",
- maxlenratio: float = 0.0,
- minlenratio: float = 0.0,
- dtype: str = "float32",
- beam_size: int = 20,
- ctc_weight: float = 0.5,
- lm_weight: float = 1.0,
- ngram_weight: float = 0.9,
- penalty: float = 0.0,
- nbest: int = 1,
- token_num_relax: int = 1,
- decoding_ind: int = 0,
- decoding_mode: str = "model1",
- frontend_conf: dict = None,
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build ASR model
- scorers = {}
- asr_model, asr_train_args = ASRTask.build_model_from_file(
- asr_train_config, asr_model_file, cmvn_file, device
- )
- frontend = None
- if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
- frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
- logging.info("asr_train_args: {}".format(asr_train_args))
- asr_model.to(dtype=getattr(torch, dtype)).eval()
- if decoding_mode == "model1":
- decoder = asr_model.decoder
- else:
- decoder = asr_model.decoder2
-
- if asr_model.ctc != None:
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
- scorers.update(
- ctc=ctc
- )
- token_list = asr_model.token_list
- scorers.update(
- decoder=decoder,
- length_bonus=LengthBonus(len(token_list)),
- )
-
- # 2. Build Language model
- if lm_train_config is not None:
- lm, lm_train_args = LMTask.build_model_from_file(
- lm_train_config, lm_file, device
- )
- scorers["lm"] = lm.lm
-
- # 3. Build ngram model
- # ngram is not supported now
- ngram = None
- scorers["ngram"] = ngram
-
- # 4. Build BeamSearch object
- # transducer is not supported now
- beam_search_transducer = None
-
- weights = dict(
- decoder=1.0 - ctc_weight,
- ctc=ctc_weight,
- lm=lm_weight,
- ngram=ngram_weight,
- length_bonus=penalty,
- )
- beam_search = BeamSearch(
- beam_size=beam_size,
- weights=weights,
- scorers=scorers,
- sos=asr_model.sos,
- eos=asr_model.eos,
- vocab_size=len(token_list),
- token_list=token_list,
- pre_beam_score_key=None if ctc_weight == 1.0 else "full",
- )
-
- beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
- for scorer in scorers.values():
- if isinstance(scorer, torch.nn.Module):
- scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
- # logging.info(f"Beam_search: {beam_search}")
- logging.info(f"Decoding device={device}, dtype={dtype}")
-
- # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
- if token_type is None:
- token_type = asr_train_args.token_type
- if bpemodel is None:
- bpemodel = asr_train_args.bpemodel
-
- if token_type is None:
- tokenizer = None
- elif token_type == "bpe":
- if bpemodel is not None:
- tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
- else:
- tokenizer = None
- else:
- tokenizer = build_tokenizer(token_type=token_type)
- converter = TokenIDConverter(token_list=token_list)
- logging.info(f"Text tokenizer: {tokenizer}")
-
- self.asr_model = asr_model
- self.asr_train_args = asr_train_args
- self.converter = converter
- self.tokenizer = tokenizer
- self.beam_search = beam_search
- self.beam_search_transducer = beam_search_transducer
- self.maxlenratio = maxlenratio
- self.minlenratio = minlenratio
- self.device = device
- self.dtype = dtype
- self.nbest = nbest
- self.token_num_relax = token_num_relax
- self.decoding_ind = decoding_ind
- self.decoding_mode = decoding_mode
- self.frontend = frontend
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
- ) -> List[
- Tuple[
- Optional[str],
- List[str],
- List[int],
- Union[Hypothesis],
- ]
- ]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.asr_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
- lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
- feats_raw = feats.clone().to(self.device)
- batch = {"speech": feats, "speech_lengths": feats_len}
-
- # a. To device
- batch = to_device(batch, device=self.device)
- # b. Forward Encoder
- _, enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
- if isinstance(enc, tuple):
- enc = enc[0]
- assert len(enc) == 1, len(enc)
- if self.decoding_mode == "model1":
- predictor_outs = self.asr_model.calc_predictor_mask(enc, enc_len)
- else:
- enc, enc_len = self.asr_model.encode2(enc, enc_len, feats_raw, feats_len, ind=self.decoding_ind)
- predictor_outs = self.asr_model.calc_predictor_mask2(enc, enc_len)
-
- scama_mask = predictor_outs[4]
- pre_token_length = predictor_outs[1]
- pre_acoustic_embeds = predictor_outs[0]
- maxlen = pre_token_length.sum().item() + self.token_num_relax
- minlen = max(0, pre_token_length.sum().item() - self.token_num_relax)
- # c. Passed the encoder result and the beam search
- nbest_hyps = self.beam_search(
- x=enc[0], scama_mask=scama_mask, pre_acoustic_embeds=pre_acoustic_embeds, maxlenratio=self.maxlenratio,
- minlenratio=self.minlenratio, maxlen=int(maxlen), minlen=int(minlen),
- )
-
- nbest_hyps = nbest_hyps[: self.nbest]
-
- results = []
- for hyp in nbest_hyps:
- assert isinstance(hyp, (Hypothesis)), type(hyp)
-
- # remove sos/eos and get results
- last_pos = -1
- if isinstance(hyp.yseq, list):
- token_int = hyp.yseq[1:last_pos]
- else:
- token_int = hyp.yseq[1:last_pos].tolist()
-
- # remove blank symbol id, which is assumed to be 0
- token_int = list(filter(lambda x: x != 0, token_int))
-
- # Change integer-ids to tokens
- token = self.converter.ids2tokens(token_int)
- token = list(filter(lambda x: x != "", token))
-
- if self.tokenizer is not None:
- text = self.tokenizer.tokens2text(token)
- else:
- text = None
- results.append((text, token, token_int, hyp))
-
- assert check_return_type(results)
- return results
-
-
-def inference(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- ngram_file: Optional[str] = None,
- cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- token_num_relax: int = 1,
- decoding_ind: int = 0,
- decoding_mode: str = "model1",
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- batch_size=batch_size,
- beam_size=beam_size,
- ngpu=ngpu,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- penalty=penalty,
- log_level=log_level,
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- raw_inputs=raw_inputs,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- token_type=token_type,
- key_file=key_file,
- word_lm_train_config=word_lm_train_config,
- bpemodel=bpemodel,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- ngram_weight=ngram_weight,
- ngram_file=ngram_file,
- nbest=nbest,
- num_workers=num_workers,
- token_num_relax=token_num_relax,
- decoding_ind=decoding_ind,
- decoding_mode=decoding_mode,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-
-def inference_modelscope(
- maxlenratio: float,
- minlenratio: float,
- batch_size: int,
- beam_size: int,
- ngpu: int,
- ctc_weight: float,
- lm_weight: float,
- penalty: float,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- asr_train_config: Optional[str],
- asr_model_file: Optional[str],
- ngram_file: Optional[str] = None,
- cmvn_file: Optional[str] = None,
- # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- lm_train_config: Optional[str] = None,
- lm_file: Optional[str] = None,
- token_type: Optional[str] = None,
- key_file: Optional[str] = None,
- word_lm_train_config: Optional[str] = None,
- bpemodel: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- streaming: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- ngram_weight: float = 0.9,
- nbest: int = 1,
- num_workers: int = 1,
- token_num_relax: int = 1,
- decoding_ind: int = 0,
- decoding_mode: str = "model1",
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if word_lm_train_config is not None:
- raise NotImplementedError("Word LM is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- if param_dict is not None and "decoding_model" in param_dict:
- if param_dict["decoding_model"] == "fast":
- decoding_ind = 0
- decoding_mode = "model1"
- elif param_dict["decoding_model"] == "normal":
- decoding_ind = 0
- decoding_mode = "model2"
- elif param_dict["decoding_model"] == "offline":
- decoding_ind = 1
- decoding_mode = "model2"
- else:
- raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"]))
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2text
- speech2text_kwargs = dict(
- asr_train_config=asr_train_config,
- asr_model_file=asr_model_file,
- cmvn_file=cmvn_file,
- lm_train_config=lm_train_config,
- lm_file=lm_file,
- ngram_file=ngram_file,
- token_type=token_type,
- bpemodel=bpemodel,
- device=device,
- maxlenratio=maxlenratio,
- minlenratio=minlenratio,
- dtype=dtype,
- beam_size=beam_size,
- ctc_weight=ctc_weight,
- lm_weight=lm_weight,
- ngram_weight=ngram_weight,
- penalty=penalty,
- nbest=nbest,
- streaming=streaming,
- token_num_relax=token_num_relax,
- decoding_ind=decoding_ind,
- decoding_mode=decoding_mode,
- )
- speech2text = Speech2Text(**speech2text_kwargs)
-
- def _forward(data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs,
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- fs=fs,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
- collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- asr_result_list = []
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- else:
- writer = None
-
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- #batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- # N-best list of (text, token, token_int, hyp_object)
- try:
- results = speech2text(**batch)
- except TooShortUttError as e:
- logging.warning(f"Utterance {keys} {e}")
- hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["sil"], [2], hyp]] * nbest
-
- # Only supporting batch_size==1
- key = keys[0]
- logging.info(f"Utterance: {key}")
- for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
- # Create a directory: outdir/{n}best_recog
- if writer is not None:
- ibest_writer = writer[f"{n}best_recog"]
-
- # Write the result to each file
- ibest_writer["token"][key] = " ".join(token)
- # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
- ibest_writer["score"][key] = str(hyp.score)
-
- if text is not None:
- text_postprocessed, word_lists = postprocess_utils.sentence_postprocess(token)
- item = {'key': key, 'value': text_postprocessed}
- asr_result_list.append(item)
- finish_count += 1
- asr_utils.print_progress(finish_count / file_count)
- if writer is not None:
- ibest_writer["text"][key] = " ".join(word_lists)
- return asr_result_list
-
- return _forward
-
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="ASR Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=True)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--asr_train_config",
- type=str,
- help="ASR training configuration",
- )
- group.add_argument(
- "--asr_model_file",
- type=str,
- help="ASR model parameter file",
- )
- group.add_argument(
- "--cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--lm_train_config",
- type=str,
- help="LM training configuration",
- )
- group.add_argument(
- "--lm_file",
- type=str,
- help="LM parameter file",
- )
- group.add_argument(
- "--word_lm_train_config",
- type=str,
- help="Word LM training configuration",
- )
- group.add_argument(
- "--word_lm_file",
- type=str,
- help="Word LM parameter file",
- )
- group.add_argument(
- "--ngram_file",
- type=str,
- help="N-gram parameter file",
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
-
- group = parser.add_argument_group("Beam-search related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
- group.add_argument("--beam_size", type=int, default=20, help="Beam size")
- group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
- group.add_argument(
- "--maxlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain max output length. "
- "If maxlenratio=0.0 (default), it uses a end-detect "
- "function "
- "to automatically find maximum hypothesis lengths."
- "If maxlenratio<0.0, its absolute value is interpreted"
- "as a constant max output length",
- )
- group.add_argument(
- "--minlenratio",
- type=float,
- default=0.0,
- help="Input length ratio to obtain min output length",
- )
- group.add_argument(
- "--ctc_weight",
- type=float,
- default=0.5,
- help="CTC weight in joint decoding",
- )
- group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
- group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
- group.add_argument("--streaming", type=str2bool, default=False)
-
- group = parser.add_argument_group("Text converter related")
- group.add_argument(
- "--token_type",
- type=str_or_none,
- default=None,
- choices=["char", "bpe", None],
- help="The token type for ASR model. "
- "If not given, refers from the training args",
- )
- group.add_argument(
- "--bpemodel",
- type=str_or_none,
- default=None,
- help="The model path of sentencepiece. "
- "If not given, refers from the training args",
- )
- group.add_argument("--token_num_relax", type=int, default=1, help="")
- group.add_argument("--decoding_ind", type=int, default=0, help="")
- group.add_argument("--decoding_mode", type=str, default="model1", help="")
- group.add_argument(
- "--ctc_weight2",
- type=float,
- default=0.0,
- help="CTC weight in joint decoding",
- )
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/diar_infer.py b/funasr/bin/diar_infer.py
new file mode 100755
index 000000000..f698a6650
--- /dev/null
+++ b/funasr/bin/diar_infer.py
@@ -0,0 +1,346 @@
+#!/usr/bin/env python3
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+from collections import OrderedDict
+import numpy as np
+import soundfile
+import torch
+from torch.nn import functional as F
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.diar import DiarTask
+from funasr.tasks.diar import EENDOLADiarTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from scipy.ndimage import median_filter
+from funasr.utils.misc import statistic_model_parameters
+from funasr.datasets.iterable_dataset import load_bytes
+from funasr.models.frontend.wav_frontend import WavFrontendMel23
+
+class Speech2DiarizationEEND:
+ """Speech2Diarization class
+
+ Examples:
+ >>> import soundfile
+ >>> import numpy as np
+ >>> speech2diar = Speech2DiarizationEEND("diar_sond_config.yml", "diar_sond.pb")
+ >>> profile = np.load("profiles.npy")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2diar(audio, profile)
+ {"spk1": [(int, int), ...], ...}
+
+ """
+
+ def __init__(
+ self,
+ diar_train_config: Union[Path, str] = None,
+ diar_model_file: Union[Path, str] = None,
+ device: str = "cpu",
+ dtype: str = "float32",
+ ):
+ assert check_argument_types()
+
+ # 1. Build Diarization model
+ diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file(
+ config_file=diar_train_config,
+ model_file=diar_model_file,
+ device=device
+ )
+ frontend = None
+ if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None:
+ frontend = WavFrontendMel23(**diar_train_args.frontend_conf)
+
+ # set up seed for eda
+ np.random.seed(diar_train_args.seed)
+ torch.manual_seed(diar_train_args.seed)
+ torch.cuda.manual_seed(diar_train_args.seed)
+ os.environ['PYTORCH_SEED'] = str(diar_train_args.seed)
+ logging.info("diar_model: {}".format(diar_model))
+ logging.info("diar_train_args: {}".format(diar_train_args))
+ diar_model.to(dtype=getattr(torch, dtype)).eval()
+
+ self.diar_model = diar_model
+ self.diar_train_args = diar_train_args
+ self.device = device
+ self.dtype = dtype
+ self.frontend = frontend
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ speech: Union[torch.Tensor, np.ndarray],
+ speech_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ diarization results
+
+ """
+ assert check_argument_types()
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.diar_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+ batch = {"speech": feats, "speech_lengths": feats_len}
+ batch = to_device(batch, device=self.device)
+ results = self.diar_model.estimate_sequential(**batch)
+
+ return results
+
+ @staticmethod
+ def from_pretrained(
+ model_tag: Optional[str] = None,
+ **kwargs: Optional[Any],
+ ):
+ """Build Speech2Diarization instance from the pretrained model.
+
+ Args:
+ model_tag (Optional[str]): Model tag of the pretrained models.
+ Currently, the tags of espnet_model_zoo are supported.
+
+ Returns:
+ Speech2Diarization: Speech2Diarization instance.
+
+ """
+ if model_tag is not None:
+ try:
+ from espnet_model_zoo.downloader import ModelDownloader
+
+ except ImportError:
+ logging.error(
+ "`espnet_model_zoo` is not installed. "
+ "Please install via `pip install -U espnet_model_zoo`."
+ )
+ raise
+ d = ModelDownloader()
+ kwargs.update(**d.download_and_unpack(model_tag))
+
+ return Speech2DiarizationEEND(**kwargs)
+
+
+class Speech2DiarizationSOND:
+ """Speech2Diarization class
+
+ Examples:
+ >>> import soundfile
+ >>> import numpy as np
+ >>> speech2diar = Speech2DiarizationSOND("diar_sond_config.yml", "diar_sond.pb")
+ >>> profile = np.load("profiles.npy")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2diar(audio, profile)
+ {"spk1": [(int, int), ...], ...}
+
+ """
+
+ def __init__(
+ self,
+ diar_train_config: Union[Path, str] = None,
+ diar_model_file: Union[Path, str] = None,
+ device: Union[str, torch.device] = "cpu",
+ batch_size: int = 1,
+ dtype: str = "float32",
+ streaming: bool = False,
+ smooth_size: int = 83,
+ dur_threshold: float = 10,
+ ):
+ assert check_argument_types()
+
+ # TODO: 1. Build Diarization model
+ diar_model, diar_train_args = DiarTask.build_model_from_file(
+ config_file=diar_train_config,
+ model_file=diar_model_file,
+ device=device
+ )
+ logging.info("diar_model: {}".format(diar_model))
+ logging.info("model parameter number: {}".format(statistic_model_parameters(diar_model)))
+ logging.info("diar_train_args: {}".format(diar_train_args))
+ diar_model.to(dtype=getattr(torch, dtype)).eval()
+
+ self.diar_model = diar_model
+ self.diar_train_args = diar_train_args
+ self.token_list = diar_train_args.token_list
+ self.smooth_size = smooth_size
+ self.dur_threshold = dur_threshold
+ self.device = device
+ self.dtype = dtype
+
+ def smooth_multi_labels(self, multi_label):
+ multi_label = median_filter(multi_label, (self.smooth_size, 1), mode="constant", cval=0.0).astype(int)
+ return multi_label
+
+ @staticmethod
+ def calc_spk_turns(label_arr, spk_list):
+ turn_list = []
+ length = label_arr.shape[0]
+ n_spk = label_arr.shape[1]
+ for k in range(n_spk):
+ if spk_list[k] == "None":
+ continue
+ in_utt = False
+ start = 0
+ for i in range(length):
+ if label_arr[i, k] == 1 and in_utt is False:
+ start = i
+ in_utt = True
+ if label_arr[i, k] == 0 and in_utt is True:
+ turn_list.append([spk_list[k], start, i - start])
+ in_utt = False
+ if in_utt:
+ turn_list.append([spk_list[k], start, length - start])
+ return turn_list
+
+ @staticmethod
+ def seq2arr(seq, vec_dim=8):
+ def int2vec(x, vec_dim=8, dtype=int):  # builtin int: the np.int alias was removed in NumPy 1.24
+ b = ('{:0' + str(vec_dim) + 'b}').format(x)
+ # little-endian order: lower bit first
+ return (np.array(list(b)[::-1]) == '1').astype(dtype)
+
+ # process oov: a label >= 2 ** vec_dim is replaced by the in-range label nearest to it in position
+ seq = np.array([int(x) for x in seq])
+ new_seq = []
+ for i, x in enumerate(seq):
+ if x < 2 ** vec_dim:
+ new_seq.append(x)
+ else:
+ idx_list = np.where(seq < 2 ** vec_dim)[0]
+ idx = np.abs(idx_list - i).argmin()
+ new_seq.append(seq[idx_list[idx]])
+ return np.vstack([int2vec(x, vec_dim) for x in new_seq])  # one row per label; np.row_stack is a deprecated alias
+
+ def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
+ logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T
+ # upsampling outputs to match inputs (encoder.time_ds_ratio frames per output step)
+ ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
+ logits_idx = F.interpolate(  # F.upsample is deprecated in favor of F.interpolate
+ logits_idx.unsqueeze(1).float(),
+ size=(ut, ),
+ mode="nearest",
+ ).squeeze(1).long()
+ logits_idx = logits_idx[0].tolist()  # only the first item of the batch is decoded
+ pse_labels = [self.token_list[x] for x in logits_idx]
+ if output_format == "pse_labels":
+ return pse_labels, None
+
+ multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers
+ multi_labels = self.smooth_multi_labels(multi_labels)
+ if output_format == "binary_labels":
+ return multi_labels, None
+
+ spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
+ spk_turns = self.calc_spk_turns(multi_labels, spk_list)
+ results = OrderedDict()
+ for spk, st, dur in spk_turns:
+ if spk not in results:
+ results[spk] = []
+ if dur > self.dur_threshold:  # turns no longer than dur_threshold frames are dropped
+ results[spk].append((st, st+dur))
+
+ # sort segments in start time ascending
+ for spk in results:
+ results[spk] = sorted(results[spk], key=lambda x: x[0])
+
+ return results, pse_labels
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ speech: Union[torch.Tensor, np.ndarray],
+ profile: Union[torch.Tensor, np.ndarray],
+ output_format: str = "speaker_turn"
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ profile: Speaker profiles
+ Returns:
+ diarization results for each speaker
+
+ """
+ assert check_argument_types()
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+ if isinstance(profile, np.ndarray):
+ profile = torch.tensor(profile)
+
+ # data: (Nsamples,) -> (1, Nsamples)
+ speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ profile = profile.unsqueeze(0).to(getattr(torch, self.dtype))
+ # lengths: (1,)
+ speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ profile_lengths = profile.new_full([1], dtype=torch.long, fill_value=profile.size(1))
+ batch = {"speech": speech, "speech_lengths": speech_lengths,
+ "profile": profile, "profile_lengths": profile_lengths}
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ logits = self.diar_model.prediction_forward(**batch)
+ results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
+
+ return results, pse_labels
+
+ @staticmethod
+ def from_pretrained(
+ model_tag: Optional[str] = None,
+ **kwargs: Optional[Any],
+ ):
+ """Build Speech2DiarizationSOND instance from the pretrained model.
+
+ Args:
+ model_tag (Optional[str]): Model tag of the pretrained models.
+ Currently, the tags of espnet_model_zoo are supported.
+
+ Returns:
+ Speech2DiarizationSOND: Speech2DiarizationSOND instance.
+
+ """
+ if model_tag is not None:
+ try:
+ from espnet_model_zoo.downloader import ModelDownloader
+
+ except ImportError:
+ logging.error(
+ "`espnet_model_zoo` is not installed. "
+ "Please install via `pip install -U espnet_model_zoo`."
+ )
+ raise
+ d = ModelDownloader()
+ kwargs.update(**d.download_and_unpack(model_tag))
+
+ return Speech2DiarizationSOND(**kwargs)
+
+
+
+
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 07974c072..08004e89b 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -15,6 +15,352 @@ from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+from collections import OrderedDict
+import numpy as np
+import soundfile
+import torch
+from torch.nn import functional as F
+from typeguard import check_argument_types
+from typeguard import check_return_type
+from scipy.signal import medfilt
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.diar import DiarTask
+from funasr.tasks.asr import ASRTask
+from funasr.tasks.diar import EENDOLADiarTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from scipy.ndimage import median_filter
+from funasr.utils.misc import statistic_model_parameters
+from funasr.datasets.iterable_dataset import load_bytes
+from funasr.bin.diar_infer import Speech2DiarizationSOND, Speech2DiarizationEEND
+
+def inference_sond(
+ diar_train_config: str,
+ diar_model_file: str,
+ output_dir: Optional[str] = None,
+ batch_size: int = 1,
+ dtype: str = "float32",
+ ngpu: int = 0,
+ seed: int = 0,
+ num_workers: int = 0,
+ log_level: Union[int, str] = "INFO",
+ key_file: Optional[str] = None,
+ model_tag: Optional[str] = None,
+ allow_variable_data_keys: bool = True,
+ streaming: bool = False,
+ smooth_size: int = 83,
+ dur_threshold: int = 10,
+ out_format: str = "vad",
+ param_dict: Optional[dict] = None,
+ mode: str = "sond",
+ **kwargs,
+):
+ assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+ logging.info("param_dict: {}".format(param_dict))
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2a. Build speech2xvec [Optional]
+ if mode == "sond_demo" and param_dict is not None and "extract_profile" in param_dict and param_dict["extract_profile"]:
+ assert "sv_train_config" in param_dict, "sv_train_config must be provided param_dict."
+ assert "sv_model_file" in param_dict, "sv_model_file must be provided in param_dict."
+ sv_train_config = param_dict["sv_train_config"]
+ sv_model_file = param_dict["sv_model_file"]
+ if "model_dir" in param_dict:
+ sv_train_config = os.path.join(param_dict["model_dir"], sv_train_config)
+ sv_model_file = os.path.join(param_dict["model_dir"], sv_model_file)
+ from funasr.bin.sv_infer import Speech2Xvector
+ speech2xvector_kwargs = dict(
+ sv_train_config=sv_train_config,
+ sv_model_file=sv_model_file,
+ device=device,
+ dtype=dtype,
+ streaming=streaming,
+ embedding_node="resnet1_dense"
+ )
+ logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
+ speech2xvector = Speech2Xvector.from_pretrained(
+ model_tag=model_tag,
+ **speech2xvector_kwargs,
+ )
+ speech2xvector.sv_model.eval()
+
+ # 2b. Build speech2diar
+ speech2diar_kwargs = dict(
+ diar_train_config=diar_train_config,
+ diar_model_file=diar_model_file,
+ device=device,
+ dtype=dtype,
+ streaming=streaming,
+ smooth_size=smooth_size,
+ dur_threshold=dur_threshold,
+ )
+ logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
+ speech2diar = Speech2DiarizationSOND.from_pretrained(
+ model_tag=model_tag,
+ **speech2diar_kwargs,
+ )
+ speech2diar.diar_model.eval()
+
+ def output_results_str(results: dict, uttid: str):
+ rst = []
+ mid = uttid.rsplit("-", 1)[0]
+ for key in results:
+ results[key] = [(x[0]/100, x[1]/100) for x in results[key]]
+ if out_format == "vad":
+ for spk, segs in results.items():
+ rst.append("{} {}".format(spk, segs))
+ else:
+ template = "SPEAKER {} 0 {:.2f} {:.2f} {} "
+ for spk, segs in results.items():
+ rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
+
+ return "\n".join(rst)
+
+ def _forward(
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
+ raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
+ output_dir_v2: Optional[str] = None,
+ param_dict: Optional[dict] = None,
+ ):
+ logging.info("param_dict: {}".format(param_dict))
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, (list, tuple)):
+ if not isinstance(raw_inputs[0], List):
+ raw_inputs = [raw_inputs]
+
+ assert all([len(example) >= 2 for example in raw_inputs]), \
+ "The length of test case in raw_inputs must larger than 1 (>=2)."
+
+ def prepare_dataset():
+ for idx, example in enumerate(raw_inputs):
+ # decode in-memory bytes with load_bytes, then read file paths with soundfile
+ example = [load_bytes(x) if isinstance(x, bytes) else x
+ for x in example]
+ example = [soundfile.read(x)[0] if isinstance(x, str) else x
+ for x in example]
+ # convert torch tensor to numpy array; test each element itself, not only example[0]
+ example = [x.numpy() if isinstance(x, torch.Tensor) else x
+ for x in example]
+ speech = example[0]
+ logging.info("Extracting profiles for {} waveforms".format(len(example)-1))
+ profile = [speech2xvector.calculate_embedding(x) for x in example[1:]]
+ profile = torch.cat(profile, dim=0)
+ yield ["test{}".format(idx)], {"speech": [speech], "profile": [profile]}
+
+ loader = prepare_dataset()
+ else:
+ raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ")
+ else:
+ # 3. Build data-iterator
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=None,
+ collate_fn=None,
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ # 7. Start for-loop
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ os.makedirs(output_path, exist_ok=True)
+ output_writer = open("{}/result.txt".format(output_path), "w")
+ pse_label_writer = open("{}/labels.txt".format(output_path), "w")
+ logging.info("Start to diarize...")
+ result_list = []
+ for idx, (keys, batch) in enumerate(loader):
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
+ results, pse_labels = speech2diar(**batch)
+ # Only supporting batch_size==1
+ key, value = keys[0], output_results_str(results, keys[0])
+ item = {"key": key, "value": value}
+ result_list.append(item)
+ if output_path is not None:
+ output_writer.write(value)
+ output_writer.flush()
+ pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
+ pse_label_writer.flush()
+
+ if idx % 100 == 0:
+ logging.info("Processing {:5d}: {}".format(idx, key))
+
+ if output_path is not None:
+ output_writer.close()
+ pse_label_writer.close()
+
+ return result_list
+
+ return _forward
+
+def inference_eend(
+ diar_train_config: str,
+ diar_model_file: str,
+ output_dir: Optional[str] = None,
+ batch_size: int = 1,
+ dtype: str = "float32",
+ ngpu: int = 1,
+ num_workers: int = 0,
+ log_level: Union[int, str] = "INFO",
+ key_file: Optional[str] = None,
+ model_tag: Optional[str] = None,
+ allow_variable_data_keys: bool = True,
+ streaming: bool = False,
+ param_dict: Optional[dict] = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+ logging.info("param_dict: {}".format(param_dict))
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Build speech2diar
+ speech2diar_kwargs = dict(
+ diar_train_config=diar_train_config,
+ diar_model_file=diar_model_file,
+ device=device,
+ dtype=dtype,
+ )
+ logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
+ speech2diar = Speech2DiarizationEEND.from_pretrained(
+ model_tag=model_tag,
+ **speech2diar_kwargs,
+ )
+ speech2diar.diar_model.eval()
+
+ def output_results_str(results: dict, uttid: str):
+ rst = []
+ mid = uttid.rsplit("-", 1)[0]
+ for key in results:
+ results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]]
+ template = "SPEAKER {} 0 {:.2f} {:.2f} {} "
+ for spk, segs in results.items():
+ rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
+
+ return "\n".join(rst)
+
+ def _forward(
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
+ raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
+ output_dir_v2: Optional[str] = None,
+ param_dict: Optional[dict] = None,
+ ):
+ # 2. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
+ loader = EENDOLADiarTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False),
+ collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ # 3. Start for-loop
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ os.makedirs(output_path, exist_ok=True)
+ output_writer = open("{}/result.txt".format(output_path), "w")
+ result_list = []
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
+ results = speech2diar(**batch)
+
+ # post process
+ a = results[0][0].cpu().numpy()
+ a = medfilt(a, (11, 1))
+ rst = []
+ for spkid, frames in enumerate(a.T):
+ frames = np.pad(frames, (1, 1), 'constant')
+ changes, = np.where(np.diff(frames, axis=0) != 0)
+ fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} "
+ for s, e in zip(changes[::2], changes[1::2]):
+ st = s / 10.
+ dur = (e - s) / 10.
+ rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
+
+ # Only supporting batch_size==1
+ value = "\n".join(rst)
+ item = {"key": keys[0], "value": value}
+ result_list.append(item)
+ if output_path is not None:
+ output_writer.write(value)
+ output_writer.flush()
+
+ if output_path is not None:
+ output_writer.close()
+
+ return result_list
+
+ return _forward
+
def get_parser():
parser = config_argparse.ArgumentParser(
@@ -127,10 +473,8 @@ def get_parser():
def inference_launch(mode, **kwargs):
if mode == "sond":
- from funasr.bin.sond_inference import inference_modelscope
- return inference_modelscope(mode=mode, **kwargs)
+ return inference_sond(mode=mode, **kwargs)
elif mode == "sond_demo":
- from funasr.bin.sond_inference import inference_modelscope
param_dict = {
"extract_profile": True,
"sv_train_config": "sv.yaml",
@@ -142,10 +486,9 @@ def inference_launch(mode, **kwargs):
kwargs["param_dict"][key] = param_dict[key]
else:
kwargs["param_dict"] = param_dict
- return inference_modelscope(mode=mode, **kwargs)
+ return inference_sond(mode=mode, **kwargs)
elif mode == "eend-ola":
- from funasr.bin.eend_ola_inference import inference_modelscope
- return inference_modelscope(mode=mode, **kwargs)
+ return inference_eend(mode=mode, **kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
@@ -178,7 +521,8 @@ def main(cmd=None):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- inference_launch(**kwargs)
+ inference_pipeline = inference_launch(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"])
if __name__ == "__main__":
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
deleted file mode 100755
index 87816dd22..000000000
--- a/funasr/bin/eend_ola_inference.py
+++ /dev/null
@@ -1,429 +0,0 @@
-#!/usr/bin/env python3
-# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
-# MIT License (https://opensource.org/licenses/MIT)
-
-import argparse
-import logging
-import os
-import sys
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-
-import numpy as np
-import torch
-from scipy.signal import medfilt
-from typeguard import check_argument_types
-
-from funasr.models.frontend.wav_frontend import WavFrontendMel23
-from funasr.tasks.diar import EENDOLADiarTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-
-
-class Speech2Diarization:
- """Speech2Diarlization class
-
- Examples:
- >>> import soundfile
- >>> import numpy as np
- >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
- >>> profile = np.load("profiles.npy")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2diar(audio, profile)
- {"spk1": [(int, int), ...], ...}
-
- """
-
- def __init__(
- self,
- diar_train_config: Union[Path, str] = None,
- diar_model_file: Union[Path, str] = None,
- device: str = "cpu",
- dtype: str = "float32",
- ):
- assert check_argument_types()
-
- # 1. Build Diarization model
- diar_model, diar_train_args = EENDOLADiarTask.build_model_from_file(
- config_file=diar_train_config,
- model_file=diar_model_file,
- device=device
- )
- frontend = None
- if diar_train_args.frontend is not None and diar_train_args.frontend_conf is not None:
- frontend = WavFrontendMel23(**diar_train_args.frontend_conf)
-
- # set up seed for eda
- np.random.seed(diar_train_args.seed)
- torch.manual_seed(diar_train_args.seed)
- torch.cuda.manual_seed(diar_train_args.seed)
- os.environ['PYTORCH_SEED'] = str(diar_train_args.seed)
- logging.info("diar_model: {}".format(diar_model))
- logging.info("diar_train_args: {}".format(diar_train_args))
- diar_model.to(dtype=getattr(torch, dtype)).eval()
-
- self.diar_model = diar_model
- self.diar_train_args = diar_train_args
- self.device = device
- self.dtype = dtype
- self.frontend = frontend
-
- @torch.no_grad()
- def __call__(
- self,
- speech: Union[torch.Tensor, np.ndarray],
- speech_lengths: Union[torch.Tensor, np.ndarray] = None
- ):
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- diarization results
-
- """
- assert check_argument_types()
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.diar_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
- batch = {"speech": feats, "speech_lengths": feats_len}
- batch = to_device(batch, device=self.device)
- results = self.diar_model.estimate_sequential(**batch)
-
- return results
-
- @staticmethod
- def from_pretrained(
- model_tag: Optional[str] = None,
- **kwargs: Optional[Any],
- ):
- """Build Speech2Diarization instance from the pretrained model.
-
- Args:
- model_tag (Optional[str]): Model tag of the pretrained models.
- Currently, the tags of espnet_model_zoo are supported.
-
- Returns:
- Speech2Diarization: Speech2Diarization instance.
-
- """
- if model_tag is not None:
- try:
- from espnet_model_zoo.downloader import ModelDownloader
-
- except ImportError:
- logging.error(
- "`espnet_model_zoo` is not installed. "
- "Please install via `pip install -U espnet_model_zoo`."
- )
- raise
- d = ModelDownloader()
- kwargs.update(**d.download_and_unpack(model_tag))
-
- return Speech2Diarization(**kwargs)
-
-
-def inference_modelscope(
- diar_train_config: str,
- diar_model_file: str,
- output_dir: Optional[str] = None,
- batch_size: int = 1,
- dtype: str = "float32",
- ngpu: int = 1,
- num_workers: int = 0,
- log_level: Union[int, str] = "INFO",
- key_file: Optional[str] = None,
- model_tag: Optional[str] = None,
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- param_dict: Optional[dict] = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
- logging.info("param_dict: {}".format(param_dict))
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Build speech2diar
- speech2diar_kwargs = dict(
- diar_train_config=diar_train_config,
- diar_model_file=diar_model_file,
- device=device,
- dtype=dtype,
- )
- logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
- speech2diar = Speech2Diarization.from_pretrained(
- model_tag=model_tag,
- **speech2diar_kwargs,
- )
- speech2diar.diar_model.eval()
-
- def output_results_str(results: dict, uttid: str):
- rst = []
- mid = uttid.rsplit("-", 1)[0]
- for key in results:
- results[key] = [(x[0] / 100, x[1] / 100) for x in results[key]]
- template = "SPEAKER {} 0 {:.2f} {:.2f} {} "
- for spk, segs in results.items():
- rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
-
- return "\n".join(rst)
-
- def _forward(
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
- raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
- output_dir_v2: Optional[str] = None,
- param_dict: Optional[dict] = None,
- ):
- # 2. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
- loader = EENDOLADiarTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=EENDOLADiarTask.build_preprocess_fn(speech2diar.diar_train_args, False),
- collate_fn=EENDOLADiarTask.build_collate_fn(speech2diar.diar_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- # 3. Start for-loop
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- os.makedirs(output_path, exist_ok=True)
- output_writer = open("{}/result.txt".format(output_path), "w")
- result_list = []
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- results = speech2diar(**batch)
-
- # post process
- a = results[0][0].cpu().numpy()
- a = medfilt(a, (11, 1))
- rst = []
- for spkid, frames in enumerate(a.T):
- frames = np.pad(frames, (1, 1), 'constant')
- changes, = np.where(np.diff(frames, axis=0) != 0)
- fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} {:s} "
- for s, e in zip(changes[::2], changes[1::2]):
- st = s / 10.
- dur = (e - s) / 10.
- rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
-
- # Only supporting batch_size==1
- value = "\n".join(rst)
- item = {"key": keys[0], "value": value}
- result_list.append(item)
- if output_path is not None:
- output_writer.write(value)
- output_writer.flush()
-
- if output_path is not None:
- output_writer.close()
-
- return result_list
-
- return _forward
-
-
-def inference(
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
- diar_train_config: Optional[str],
- diar_model_file: Optional[str],
- output_dir: Optional[str] = None,
- batch_size: int = 1,
- dtype: str = "float32",
- ngpu: int = 0,
- seed: int = 0,
- num_workers: int = 1,
- log_level: Union[int, str] = "INFO",
- key_file: Optional[str] = None,
- model_tag: Optional[str] = None,
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- smooth_size: int = 83,
- dur_threshold: int = 10,
- out_format: str = "vad",
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- diar_train_config=diar_train_config,
- diar_model_file=diar_model_file,
- output_dir=output_dir,
- batch_size=batch_size,
- dtype=dtype,
- ngpu=ngpu,
- seed=seed,
- num_workers=num_workers,
- log_level=log_level,
- key_file=key_file,
- model_tag=model_tag,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- smooth_size=smooth_size,
- dur_threshold=dur_threshold,
- out_format=out_format,
- **kwargs,
- )
-
- return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Speaker verification/x-vector extraction",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--diar_train_config",
- type=str,
- help="diarization training configuration",
- )
- group.add_argument(
- "--diar_model_file",
- type=str,
- help="diarization model parameter file",
- )
- group.add_argument(
- "--dur_threshold",
- type=int,
- default=10,
- help="The threshold for short segments in number frames"
- )
- parser.add_argument(
- "--smooth_size",
- type=int,
- default=83,
- help="The smoothing window length in number frames"
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
- parser.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- parser.add_argument("--streaming", type=str2bool, default=False)
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- logging.info("args: {}".format(kwargs))
- if args.output_dir is None:
- jobid, n_gpu = 1, 1
- gpuid = args.gpuid_list.split(",")[jobid - 1]
- else:
- jobid = int(args.output_dir.split(".")[-1])
- n_gpu = len(args.gpuid_list.split(","))
- gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- results_list = inference(**kwargs)
- for results in results_list:
- print("{} {}".format(results["key"], results["value"]))
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/modelscope_infer.py b/funasr/bin/modelscope_infer.py
deleted file mode 100755
index bc24340b5..000000000
--- a/funasr/bin/modelscope_infer.py
+++ /dev/null
@@ -1,90 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-import os
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
- parser = argparse.ArgumentParser(
- description="decoding configs",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
- parser.add_argument("--model_name",
- type=str,
- default="speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
- help="model name in modelscope")
- parser.add_argument("--model_revision",
- type=str,
- default="v1.0.4",
- help="model revision in modelscope")
- parser.add_argument("--local_model_path",
- type=str,
- default=None,
- help="local model path, usually for fine-tuning")
- parser.add_argument("--wav_list",
- type=str,
- help="input wav list")
- parser.add_argument("--output_file",
- type=str,
- help="saving decoding results")
- parser.add_argument(
- "--njob",
- type=int,
- default=1,
- help="The number of jobs for each gpu",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- args = parser.parse_args()
-
- # set logging messages
- logging.basicConfig(
- level=logging.INFO,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
- logging.info("Decoding args: {}".format(args))
-
- # gpu setting
- if args.ngpu > 0:
- jobid = int(args.output_file.split(".")[-1])
- gpuid = args.gpuid_list.split(",")[(jobid - 1) // args.njob]
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
-
- if args.local_model_path is None:
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model="damo/{}".format(args.model_name),
- model_revision=args.model_revision)
- else:
- inference_pipeline = pipeline(
- task=Tasks.auto_speech_recognition,
- model=args.local_model_path)
-
-
- with open(args.wav_list, 'r') as f_wav:
- wav_lines = f_wav.readlines()
-
- with open(args.output_file, "w") as f_out:
- for line in wav_lines:
- wav_id, wav_path = line.strip().split()
- logging.info("decoding, utt_id: ['{}']".format(wav_id))
- rec_result = inference_pipeline(audio_in=wav_path)
- if 'text' in rec_result:
- text = rec_result["text"]
- else:
- text = ''
- f_out.write(wav_id + " " + text + "\n")
- logging.info("best hypo: {} \n".format(text))
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punc_infer.py
similarity index 53%
rename from funasr/bin/punctuation_infer_vadrealtime.py
rename to funasr/bin/punc_infer.py
index 0dc01f531..41c4da323 100644
--- a/funasr/bin/punctuation_infer_vadrealtime.py
+++ b/funasr/bin/punc_infer.py
@@ -61,16 +61,10 @@ class Text2Punc:
text_name="text",
non_linguistic_symbols=train_args.non_linguistic_symbols,
)
-
@torch.no_grad()
- def __call__(self, text: Union[list, str], cache: list, split_size=20):
- if cache is not None and len(cache) > 0:
- precache = "".join(cache)
- else:
- precache = ""
- cache = []
- data = {"text": precache + " " + text}
+ def __call__(self, text: Union[list, str], split_size=20):
+ data = {"text": text}
result = self.preprocessor(data=data, uid="12938712838719")
split_text = self.preprocessor.pop_split_text_data(result)
mini_sentences = split_to_mini_sentence(split_text, split_size)
@@ -78,10 +72,9 @@ class Text2Punc:
assert len(mini_sentences) == len(mini_sentences_id)
cache_sent = []
cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
- sentence_punc_list = []
- sentence_words_list= []
+ new_mini_sentence = ""
+ new_mini_sentence_punc = []
cache_pop_trigger_limit = 200
- skip_num = 0
for mini_sentence_i in range(len(mini_sentences)):
mini_sentence = mini_sentences[mini_sentence_i]
mini_sentence_id = mini_sentences_id[mini_sentence_i]
@@ -90,7 +83,6 @@ class Text2Punc:
data = {
"text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
"text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
- "vad_indexes": torch.from_numpy(np.array([len(cache)], dtype='int32')),
}
data = to_device(data, self.device)
y, _ = self.wrapped_model(**data)
@@ -110,7 +102,7 @@ class Text2Punc:
break
if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
last_comma_index = i
-
+
if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
# The sentence it too long, cut off at a comma.
sentenceEnd = last_comma_index
@@ -120,10 +112,130 @@ class Text2Punc:
mini_sentence = mini_sentence[0:sentenceEnd + 1]
punctuations = punctuations[0:sentenceEnd + 1]
+ # if len(punctuations) == 0:
+ # continue
+
+ punctuations_np = punctuations.cpu().numpy()
+ new_mini_sentence_punc += [int(x) for x in punctuations_np]
+ words_with_punc = []
+ for i in range(len(mini_sentence)):
+ if i > 0:
+ if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1:
+ mini_sentence[i] = " " + mini_sentence[i]
+ words_with_punc.append(mini_sentence[i])
+ if self.punc_list[punctuations[i]] != "_":
+ words_with_punc.append(self.punc_list[punctuations[i]])
+ new_mini_sentence += "".join(words_with_punc)
+ # Add Period for the end of the sentence
+ new_mini_sentence_out = new_mini_sentence
+ new_mini_sentence_punc_out = new_mini_sentence_punc
+ if mini_sentence_i == len(mini_sentences) - 1:
+ if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、":
+ new_mini_sentence_out = new_mini_sentence[:-1] + "。"
+ new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
+ elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?":
+ new_mini_sentence_out = new_mini_sentence + "。"
+ new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
+ return new_mini_sentence_out, new_mini_sentence_punc_out
+
+
+class Text2PuncVADRealtime:
+
+ def __init__(
+ self,
+ train_config: Optional[str],
+ model_file: Optional[str],
+ device: str = "cpu",
+ dtype: str = "float32",
+ ):
+ # Build Model
+ model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device)
+ self.device = device
+ # Wrape model to make model.nll() data-parallel
+ self.wrapped_model = ForwardAdaptor(model, "inference")
+ self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
+ # logging.info(f"Model:\n{model}")
+ self.punc_list = train_args.punc_list
+ self.period = 0
+ for i in range(len(self.punc_list)):
+ if self.punc_list[i] == ",":
+ self.punc_list[i] = ","
+ elif self.punc_list[i] == "?":
+ self.punc_list[i] = "?"
+ elif self.punc_list[i] == "。":
+ self.period = i
+ self.preprocessor = CodeMixTokenizerCommonPreprocessor(
+ train=False,
+ token_type=train_args.token_type,
+ token_list=train_args.token_list,
+ bpemodel=train_args.bpemodel,
+ text_cleaner=train_args.cleaner,
+ g2p_type=train_args.g2p,
+ text_name="text",
+ non_linguistic_symbols=train_args.non_linguistic_symbols,
+ )
+
+ @torch.no_grad()
+ def __call__(self, text: Union[list, str], cache: list, split_size=20):
+ if cache is not None and len(cache) > 0:
+ precache = "".join(cache)
+ else:
+ precache = ""
+ cache = []
+ data = {"text": precache + " " + text}
+ result = self.preprocessor(data=data, uid="12938712838719")
+ split_text = self.preprocessor.pop_split_text_data(result)
+ mini_sentences = split_to_mini_sentence(split_text, split_size)
+ mini_sentences_id = split_to_mini_sentence(data["text"], split_size)
+ assert len(mini_sentences) == len(mini_sentences_id)
+ cache_sent = []
+ cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
+ sentence_punc_list = []
+ sentence_words_list = []
+ cache_pop_trigger_limit = 200
+ skip_num = 0
+ for mini_sentence_i in range(len(mini_sentences)):
+ mini_sentence = mini_sentences[mini_sentence_i]
+ mini_sentence_id = mini_sentences_id[mini_sentence_i]
+ mini_sentence = cache_sent + mini_sentence
+ mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
+ data = {
+ "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
+ "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
+ "vad_indexes": torch.from_numpy(np.array([len(cache)], dtype='int32')),
+ }
+ data = to_device(data, self.device)
+ y, _ = self.wrapped_model(**data)
+ _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
+ punctuations = indices
+ if indices.size()[0] != 1:
+ punctuations = torch.squeeze(indices)
+ assert punctuations.size()[0] == len(mini_sentence)
+
+ # Search for the last Period/QuestionMark as cache
+ if mini_sentence_i < len(mini_sentences) - 1:
+ sentenceEnd = -1
+ last_comma_index = -1
+ for i in range(len(punctuations) - 2, 1, -1):
+ if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?":
+ sentenceEnd = i
+ break
+ if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
+ last_comma_index = i
+
+ if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
+ # The sentence it too long, cut off at a comma.
+ sentenceEnd = last_comma_index
+ punctuations[sentenceEnd] = self.period
+ cache_sent = mini_sentence[sentenceEnd + 1:]
+ cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
+ mini_sentence = mini_sentence[0:sentenceEnd + 1]
+ punctuations = punctuations[0:sentenceEnd + 1]
+
punctuations_np = punctuations.cpu().numpy()
sentence_punc_list += [self.punc_list[int(x)] for x in punctuations_np]
sentence_words_list += mini_sentence
-
+
assert len(sentence_punc_list) == len(sentence_words_list)
words_with_punc = []
sentence_punc_list_out = []
@@ -140,172 +252,16 @@ class Text2Punc:
if sentence_punc_list[i] != "_":
words_with_punc.append(sentence_punc_list[i])
sentence_out = "".join(words_with_punc)
-
+
sentenceEnd = -1
for i in range(len(sentence_punc_list) - 2, 1, -1):
if sentence_punc_list[i] == "。" or sentence_punc_list[i] == "?":
- sentenceEnd = i
- break
- cache_out = sentence_words_list[sentenceEnd + 1 :]
+ sentenceEnd = i
+ break
+ cache_out = sentence_words_list[sentenceEnd + 1:]
if sentence_out[-1] in self.punc_list:
sentence_out = sentence_out[:-1]
sentence_punc_list_out[-1] = "_"
return sentence_out, sentence_punc_list_out, cache_out
-def inference(
- batch_size: int,
- dtype: str,
- ngpu: int,
- seed: int,
- num_workers: int,
- output_dir: str,
- log_level: Union[int, str],
- train_config: Optional[str],
- model_file: Optional[str],
- key_file: Optional[str] = None,
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
- raw_inputs: Union[List[Any], bytes, str] = None,
- cache: List[Any] = None,
- param_dict: dict = None,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- output_dir=output_dir,
- batch_size=batch_size,
- dtype=dtype,
- ngpu=ngpu,
- seed=seed,
- num_workers=num_workers,
- log_level=log_level,
- key_file=key_file,
- train_config=train_config,
- model_file=model_file,
- param_dict=param_dict,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs, cache)
-
-
-def inference_modelscope(
- batch_size: int,
- dtype: str,
- ngpu: int,
- seed: int,
- num_workers: int,
- log_level: Union[int, str],
- #cache: list,
- key_file: Optional[str],
- train_config: Optional[str],
- model_file: Optional[str],
- output_dir: Optional[str] = None,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
- text2punc = Text2Punc(train_config, model_file, device)
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[List[Any], bytes, str] = None,
- output_dir_v2: Optional[str] = None,
- cache: List[Any] = None,
- param_dict: dict = None,
- ):
- results = []
- split_size = 10
- cache_in = param_dict["cache"]
- if raw_inputs != None:
- line = raw_inputs.strip()
- key = "demo"
- if line == "":
- item = {'key': key, 'value': ""}
- results.append(item)
- return results
- result, _, cache = text2punc(line, cache_in)
- param_dict["cache"] = cache
- item = {'key': key, 'value': result}
- results.append(item)
- return results
-
- return results
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Punctuation inference",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
- parser.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
- group.add_argument("--raw_inputs", type=str, required=False)
- group.add_argument("--cache", type=list, required=False)
- group.add_argument("--param_dict", type=dict, required=False)
- group.add_argument("--key_file", type=str_or_none)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument("--train_config", type=str)
- group.add_argument("--model_file", type=str)
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- # kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py
index b1d923553..594a7be21 100755
--- a/funasr/bin/punc_inference_launch.py
+++ b/funasr/bin/punc_inference_launch.py
@@ -14,6 +14,166 @@ from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
from funasr.utils.types import float_or_none
+import argparse
+import logging
+from pathlib import Path
+import sys
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Any
+from typing import List
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.punctuation import PunctuationTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.forward_adaptor import ForwardAdaptor
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.datasets.preprocessor import split_to_mini_sentence
+from funasr.bin.punc_infer import Text2Punc, Text2PuncVADRealtime
+
+def inference_punc(
+ batch_size: int,
+ dtype: str,
+ ngpu: int,
+ seed: int,
+ num_workers: int,
+ log_level: Union[int, str],
+ key_file: Optional[str],
+ train_config: Optional[str],
+ model_file: Optional[str],
+ output_dir: Optional[str] = None,
+ param_dict: dict = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+ text2punc = Text2Punc(train_config, model_file, device)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[List[Any], bytes, str] = None,
+ output_dir_v2: Optional[str] = None,
+ cache: List[Any] = None,
+ param_dict: dict = None,
+ ):
+ results = []
+ split_size = 20
+
+ if raw_inputs != None:
+ line = raw_inputs.strip()
+ key = "demo"
+ if line == "":
+ item = {'key': key, 'value': ""}
+ results.append(item)
+ return results
+ result, _ = text2punc(line)
+ item = {'key': key, 'value': result}
+ results.append(item)
+ return results
+
+ for inference_text, _, _ in data_path_and_name_and_type:
+ with open(inference_text, "r", encoding="utf-8") as fin:
+ for line in fin:
+ line = line.strip()
+ segs = line.split("\t")
+ if len(segs) != 2:
+ continue
+ key = segs[0]
+ if len(segs[1]) == 0:
+ continue
+ result, _ = text2punc(segs[1])
+ item = {'key': key, 'value': result}
+ results.append(item)
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path != None:
+ output_file_name = "infer.out"
+ Path(output_path).mkdir(parents=True, exist_ok=True)
+ output_file_path = (Path(output_path) / output_file_name).absolute()
+ with open(output_file_path, "w", encoding="utf-8") as fout:
+ for item_i in results:
+ key_out = item_i["key"]
+ value_out = item_i["value"]
+ fout.write(f"{key_out}\t{value_out}\n")
+ return results
+
+ return _forward
+
+def inference_punc_vad_realtime(
+ batch_size: int,
+ dtype: str,
+ ngpu: int,
+ seed: int,
+ num_workers: int,
+ log_level: Union[int, str],
+ #cache: list,
+ key_file: Optional[str],
+ train_config: Optional[str],
+ model_file: Optional[str],
+ output_dir: Optional[str] = None,
+ param_dict: dict = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+ text2punc = Text2PuncVADRealtime(train_config, model_file, device)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[List[Any], bytes, str] = None,
+ output_dir_v2: Optional[str] = None,
+ cache: List[Any] = None,
+ param_dict: dict = None,
+ ):
+ results = []
+ split_size = 10
+ cache_in = param_dict["cache"]
+ if raw_inputs != None:
+ line = raw_inputs.strip()
+ key = "demo"
+ if line == "":
+ item = {'key': key, 'value': ""}
+ results.append(item)
+ return results
+ result, _, cache = text2punc(line, cache_in)
+ param_dict["cache"] = cache
+ item = {'key': key, 'value': result}
+ results.append(item)
+ return results
+
+ return results
+
+ return _forward
+
def get_parser():
parser = config_argparse.ArgumentParser(
@@ -72,11 +232,9 @@ def get_parser():
def inference_launch(mode, **kwargs):
if mode == "punc":
- from funasr.bin.punctuation_infer import inference_modelscope
- return inference_modelscope(**kwargs)
+ return inference_punc(**kwargs)
if mode == "punc_VadRealtime":
- from funasr.bin.punctuation_infer_vadrealtime import inference_modelscope
- return inference_modelscope(**kwargs)
+ return inference_punc_vad_realtime(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
@@ -105,7 +263,9 @@ def main(cmd=None):
kwargs.pop("gpuid_list", None)
kwargs.pop("njob", None)
- results = inference_launch(**kwargs)
+ inference_pipeline = inference_launch(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"])
+
if __name__ == "__main__":
diff --git a/funasr/bin/punctuation_infer.py b/funasr/bin/punctuation_infer.py
deleted file mode 100644
index 077814d4f..000000000
--- a/funasr/bin/punctuation_infer.py
+++ /dev/null
@@ -1,320 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import logging
-from pathlib import Path
-import sys
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Any
-from typing import List
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-
-from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.tasks.punctuation import PunctuationTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.forward_adaptor import ForwardAdaptor
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.datasets.preprocessor import split_to_mini_sentence
-
-
-class Text2Punc:
-
- def __init__(
- self,
- train_config: Optional[str],
- model_file: Optional[str],
- device: str = "cpu",
- dtype: str = "float32",
- ):
- # Build Model
- model, train_args = PunctuationTask.build_model_from_file(train_config, model_file, device)
- self.device = device
- # Wrape model to make model.nll() data-parallel
- self.wrapped_model = ForwardAdaptor(model, "inference")
- self.wrapped_model.to(dtype=getattr(torch, dtype)).to(device=device).eval()
- # logging.info(f"Model:\n{model}")
- self.punc_list = train_args.punc_list
- self.period = 0
- for i in range(len(self.punc_list)):
- if self.punc_list[i] == ",":
- self.punc_list[i] = ","
- elif self.punc_list[i] == "?":
- self.punc_list[i] = "?"
- elif self.punc_list[i] == "。":
- self.period = i
- self.preprocessor = CodeMixTokenizerCommonPreprocessor(
- train=False,
- token_type=train_args.token_type,
- token_list=train_args.token_list,
- bpemodel=train_args.bpemodel,
- text_cleaner=train_args.cleaner,
- g2p_type=train_args.g2p,
- text_name="text",
- non_linguistic_symbols=train_args.non_linguistic_symbols,
- )
-
- @torch.no_grad()
- def __call__(self, text: Union[list, str], split_size=20):
- data = {"text": text}
- result = self.preprocessor(data=data, uid="12938712838719")
- split_text = self.preprocessor.pop_split_text_data(result)
- mini_sentences = split_to_mini_sentence(split_text, split_size)
- mini_sentences_id = split_to_mini_sentence(data["text"], split_size)
- assert len(mini_sentences) == len(mini_sentences_id)
- cache_sent = []
- cache_sent_id = torch.from_numpy(np.array([], dtype='int32'))
- new_mini_sentence = ""
- new_mini_sentence_punc = []
- cache_pop_trigger_limit = 200
- for mini_sentence_i in range(len(mini_sentences)):
- mini_sentence = mini_sentences[mini_sentence_i]
- mini_sentence_id = mini_sentences_id[mini_sentence_i]
- mini_sentence = cache_sent + mini_sentence
- mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
- data = {
- "text": torch.unsqueeze(torch.from_numpy(mini_sentence_id), 0),
- "text_lengths": torch.from_numpy(np.array([len(mini_sentence_id)], dtype='int32')),
- }
- data = to_device(data, self.device)
- y, _ = self.wrapped_model(**data)
- _, indices = y.view(-1, y.shape[-1]).topk(1, dim=1)
- punctuations = indices
- if indices.size()[0] != 1:
- punctuations = torch.squeeze(indices)
- assert punctuations.size()[0] == len(mini_sentence)
-
- # Search for the last Period/QuestionMark as cache
- if mini_sentence_i < len(mini_sentences) - 1:
- sentenceEnd = -1
- last_comma_index = -1
- for i in range(len(punctuations) - 2, 1, -1):
- if self.punc_list[punctuations[i]] == "。" or self.punc_list[punctuations[i]] == "?":
- sentenceEnd = i
- break
- if last_comma_index < 0 and self.punc_list[punctuations[i]] == ",":
- last_comma_index = i
-
- if sentenceEnd < 0 and len(mini_sentence) > cache_pop_trigger_limit and last_comma_index >= 0:
- # The sentence it too long, cut off at a comma.
- sentenceEnd = last_comma_index
- punctuations[sentenceEnd] = self.period
- cache_sent = mini_sentence[sentenceEnd + 1:]
- cache_sent_id = mini_sentence_id[sentenceEnd + 1:]
- mini_sentence = mini_sentence[0:sentenceEnd + 1]
- punctuations = punctuations[0:sentenceEnd + 1]
-
- # if len(punctuations) == 0:
- # continue
-
- punctuations_np = punctuations.cpu().numpy()
- new_mini_sentence_punc += [int(x) for x in punctuations_np]
- words_with_punc = []
- for i in range(len(mini_sentence)):
- if i > 0:
- if len(mini_sentence[i][0].encode()) == 1 and len(mini_sentence[i - 1][0].encode()) == 1:
- mini_sentence[i] = " " + mini_sentence[i]
- words_with_punc.append(mini_sentence[i])
- if self.punc_list[punctuations[i]] != "_":
- words_with_punc.append(self.punc_list[punctuations[i]])
- new_mini_sentence += "".join(words_with_punc)
- # Add Period for the end of the sentence
- new_mini_sentence_out = new_mini_sentence
- new_mini_sentence_punc_out = new_mini_sentence_punc
- if mini_sentence_i == len(mini_sentences) - 1:
- if new_mini_sentence[-1] == "," or new_mini_sentence[-1] == "、":
- new_mini_sentence_out = new_mini_sentence[:-1] + "。"
- new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
- elif new_mini_sentence[-1] != "。" and new_mini_sentence[-1] != "?":
- new_mini_sentence_out = new_mini_sentence + "。"
- new_mini_sentence_punc_out = new_mini_sentence_punc[:-1] + [self.period]
- return new_mini_sentence_out, new_mini_sentence_punc_out
-
-
-def inference(
- batch_size: int,
- dtype: str,
- ngpu: int,
- seed: int,
- num_workers: int,
- output_dir: str,
- log_level: Union[int, str],
- train_config: Optional[str],
- model_file: Optional[str],
- key_file: Optional[str] = None,
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
- raw_inputs: Union[List[Any], bytes, str] = None,
- cache: List[Any] = None,
- param_dict: dict = None,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- output_dir=output_dir,
- batch_size=batch_size,
- dtype=dtype,
- ngpu=ngpu,
- seed=seed,
- num_workers=num_workers,
- log_level=log_level,
- key_file=key_file,
- train_config=train_config,
- model_file=model_file,
- param_dict=param_dict,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-
-def inference_modelscope(
- batch_size: int,
- dtype: str,
- ngpu: int,
- seed: int,
- num_workers: int,
- log_level: Union[int, str],
- key_file: Optional[str],
- train_config: Optional[str],
- model_file: Optional[str],
- output_dir: Optional[str] = None,
- param_dict: dict = None,
- **kwargs,
-):
- assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
- text2punc = Text2Punc(train_config, model_file, device)
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[List[Any], bytes, str] = None,
- output_dir_v2: Optional[str] = None,
- cache: List[Any] = None,
- param_dict: dict = None,
- ):
- results = []
- split_size = 20
-
- if raw_inputs != None:
- line = raw_inputs.strip()
- key = "demo"
- if line == "":
- item = {'key': key, 'value': ""}
- results.append(item)
- return results
- result, _ = text2punc(line)
- item = {'key': key, 'value': result}
- results.append(item)
- return results
-
- for inference_text, _, _ in data_path_and_name_and_type:
- with open(inference_text, "r", encoding="utf-8") as fin:
- for line in fin:
- line = line.strip()
- segs = line.split("\t")
- if len(segs) != 2:
- continue
- key = segs[0]
- if len(segs[1]) == 0:
- continue
- result, _ = text2punc(segs[1])
- item = {'key': key, 'value': result}
- results.append(item)
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path != None:
- output_file_name = "infer.out"
- Path(output_path).mkdir(parents=True, exist_ok=True)
- output_file_path = (Path(output_path) / output_file_name).absolute()
- with open(output_file_path, "w", encoding="utf-8") as fout:
- for item_i in results:
- key_out = item_i["key"]
- value_out = item_i["value"]
- fout.write(f"{key_out}\t{value_out}\n")
- return results
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Punctuation inference",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
- parser.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument("--data_path_and_name_and_type", type=str2triple_str, action="append", required=False)
- group.add_argument("--raw_inputs", type=str, required=False)
- group.add_argument("--cache", type=list, required=False)
- group.add_argument("--param_dict", type=dict, required=False)
- group.add_argument("--key_file", type=str_or_none)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument("--train_config", type=str)
- group.add_argument("--model_file", type=str)
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- # kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
deleted file mode 100755
index c55bc3544..000000000
--- a/funasr/bin/sond_inference.py
+++ /dev/null
@@ -1,577 +0,0 @@
-#!/usr/bin/env python3
-# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
-# MIT License (https://opensource.org/licenses/MIT)
-
-import argparse
-import logging
-import os
-import sys
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-
-from collections import OrderedDict
-import numpy as np
-import soundfile
-import torch
-from torch.nn import functional as F
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.tasks.diar import DiarTask
-from funasr.tasks.asr import ASRTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from scipy.ndimage import median_filter
-from funasr.utils.misc import statistic_model_parameters
-from funasr.datasets.iterable_dataset import load_bytes
-
-
-class Speech2Diarization:
- """Speech2Xvector class
-
- Examples:
- >>> import soundfile
- >>> import numpy as np
- >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
- >>> profile = np.load("profiles.npy")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2diar(audio, profile)
- {"spk1": [(int, int), ...], ...}
-
- """
-
- def __init__(
- self,
- diar_train_config: Union[Path, str] = None,
- diar_model_file: Union[Path, str] = None,
- device: Union[str, torch.device] = "cpu",
- batch_size: int = 1,
- dtype: str = "float32",
- streaming: bool = False,
- smooth_size: int = 83,
- dur_threshold: float = 10,
- ):
- assert check_argument_types()
-
- # TODO: 1. Build Diarization model
- diar_model, diar_train_args = DiarTask.build_model_from_file(
- config_file=diar_train_config,
- model_file=diar_model_file,
- device=device
- )
- logging.info("diar_model: {}".format(diar_model))
- logging.info("model parameter number: {}".format(statistic_model_parameters(diar_model)))
- logging.info("diar_train_args: {}".format(diar_train_args))
- diar_model.to(dtype=getattr(torch, dtype)).eval()
-
- self.diar_model = diar_model
- self.diar_train_args = diar_train_args
- self.token_list = diar_train_args.token_list
- self.smooth_size = smooth_size
- self.dur_threshold = dur_threshold
- self.device = device
- self.dtype = dtype
-
- def smooth_multi_labels(self, multi_label):
- multi_label = median_filter(multi_label, (self.smooth_size, 1), mode="constant", cval=0.0).astype(int)
- return multi_label
-
- @staticmethod
- def calc_spk_turns(label_arr, spk_list):
- turn_list = []
- length = label_arr.shape[0]
- n_spk = label_arr.shape[1]
- for k in range(n_spk):
- if spk_list[k] == "None":
- continue
- in_utt = False
- start = 0
- for i in range(length):
- if label_arr[i, k] == 1 and in_utt is False:
- start = i
- in_utt = True
- if label_arr[i, k] == 0 and in_utt is True:
- turn_list.append([spk_list[k], start, i - start])
- in_utt = False
- if in_utt:
- turn_list.append([spk_list[k], start, length - start])
- return turn_list
-
- @staticmethod
- def seq2arr(seq, vec_dim=8):
- def int2vec(x, vec_dim=8, dtype=np.int):
- b = ('{:0' + str(vec_dim) + 'b}').format(x)
- # little-endian order: lower bit first
- return (np.array(list(b)[::-1]) == '1').astype(dtype)
-
- # process oov
- seq = np.array([int(x) for x in seq])
- new_seq = []
- for i, x in enumerate(seq):
- if x < 2 ** vec_dim:
- new_seq.append(x)
- else:
- idx_list = np.where(seq < 2 ** vec_dim)[0]
- idx = np.abs(idx_list - i).argmin()
- new_seq.append(seq[idx_list[idx]])
- return np.row_stack([int2vec(x, vec_dim) for x in new_seq])
-
- def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
- logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T
- # upsampling outputs to match inputs
- ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
- logits_idx = F.upsample(
- logits_idx.unsqueeze(1).float(),
- size=(ut, ),
- mode="nearest",
- ).squeeze(1).long()
- logits_idx = logits_idx[0].tolist()
- pse_labels = [self.token_list[x] for x in logits_idx]
- if output_format == "pse_labels":
- return pse_labels, None
-
- multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers
- multi_labels = self.smooth_multi_labels(multi_labels)
- if output_format == "binary_labels":
- return multi_labels, None
-
- spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
- spk_turns = self.calc_spk_turns(multi_labels, spk_list)
- results = OrderedDict()
- for spk, st, dur in spk_turns:
- if spk not in results:
- results[spk] = []
- if dur > self.dur_threshold:
- results[spk].append((st, st+dur))
-
- # sort segments in start time ascending
- for spk in results:
- results[spk] = sorted(results[spk], key=lambda x: x[0])
-
- return results, pse_labels
-
- @torch.no_grad()
- def __call__(
- self,
- speech: Union[torch.Tensor, np.ndarray],
- profile: Union[torch.Tensor, np.ndarray],
- output_format: str = "speaker_turn"
- ):
- """Inference
-
- Args:
- speech: Input speech data
- profile: Speaker profiles
- Returns:
- diarization results for each speaker
-
- """
- assert check_argument_types()
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- if isinstance(profile, np.ndarray):
- profile = torch.tensor(profile)
-
- # data: (Nsamples,) -> (1, Nsamples)
- speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- profile = profile.unsqueeze(0).to(getattr(torch, self.dtype))
- # lengths: (1,)
- speech_lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
- profile_lengths = profile.new_full([1], dtype=torch.long, fill_value=profile.size(1))
- batch = {"speech": speech, "speech_lengths": speech_lengths,
- "profile": profile, "profile_lengths": profile_lengths}
- # a. To device
- batch = to_device(batch, device=self.device)
-
- logits = self.diar_model.prediction_forward(**batch)
- results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
-
- return results, pse_labels
-
- @staticmethod
- def from_pretrained(
- model_tag: Optional[str] = None,
- **kwargs: Optional[Any],
- ):
- """Build Speech2Xvector instance from the pretrained model.
-
- Args:
- model_tag (Optional[str]): Model tag of the pretrained models.
- Currently, the tags of espnet_model_zoo are supported.
-
- Returns:
- Speech2Xvector: Speech2Xvector instance.
-
- """
- if model_tag is not None:
- try:
- from espnet_model_zoo.downloader import ModelDownloader
-
- except ImportError:
- logging.error(
- "`espnet_model_zoo` is not installed. "
- "Please install via `pip install -U espnet_model_zoo`."
- )
- raise
- d = ModelDownloader()
- kwargs.update(**d.download_and_unpack(model_tag))
-
- return Speech2Diarization(**kwargs)
-
-
-def inference_modelscope(
- diar_train_config: str,
- diar_model_file: str,
- output_dir: Optional[str] = None,
- batch_size: int = 1,
- dtype: str = "float32",
- ngpu: int = 0,
- seed: int = 0,
- num_workers: int = 0,
- log_level: Union[int, str] = "INFO",
- key_file: Optional[str] = None,
- model_tag: Optional[str] = None,
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- smooth_size: int = 83,
- dur_threshold: int = 10,
- out_format: str = "vad",
- param_dict: Optional[dict] = None,
- mode: str = "sond",
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
- logging.info("param_dict: {}".format(param_dict))
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2a. Build speech2xvec [Optional]
- if mode == "sond_demo" and param_dict is not None and "extract_profile" in param_dict and param_dict["extract_profile"]:
- assert "sv_train_config" in param_dict, "sv_train_config must be provided param_dict."
- assert "sv_model_file" in param_dict, "sv_model_file must be provided in param_dict."
- sv_train_config = param_dict["sv_train_config"]
- sv_model_file = param_dict["sv_model_file"]
- if "model_dir" in param_dict:
- sv_train_config = os.path.join(param_dict["model_dir"], sv_train_config)
- sv_model_file = os.path.join(param_dict["model_dir"], sv_model_file)
- from funasr.bin.sv_inference import Speech2Xvector
- speech2xvector_kwargs = dict(
- sv_train_config=sv_train_config,
- sv_model_file=sv_model_file,
- device=device,
- dtype=dtype,
- streaming=streaming,
- embedding_node="resnet1_dense"
- )
- logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
- speech2xvector = Speech2Xvector.from_pretrained(
- model_tag=model_tag,
- **speech2xvector_kwargs,
- )
- speech2xvector.sv_model.eval()
-
- # 2b. Build speech2diar
- speech2diar_kwargs = dict(
- diar_train_config=diar_train_config,
- diar_model_file=diar_model_file,
- device=device,
- dtype=dtype,
- streaming=streaming,
- smooth_size=smooth_size,
- dur_threshold=dur_threshold,
- )
- logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
- speech2diar = Speech2Diarization.from_pretrained(
- model_tag=model_tag,
- **speech2diar_kwargs,
- )
- speech2diar.diar_model.eval()
-
- def output_results_str(results: dict, uttid: str):
- rst = []
- mid = uttid.rsplit("-", 1)[0]
- for key in results:
- results[key] = [(x[0]/100, x[1]/100) for x in results[key]]
- if out_format == "vad":
- for spk, segs in results.items():
- rst.append("{} {}".format(spk, segs))
- else:
- template = "SPEAKER {} 0 {:.2f} {:.2f} {} "
- for spk, segs in results.items():
- rst.extend([template.format(mid, st, ed, spk) for st, ed in segs])
-
- return "\n".join(rst)
-
- def _forward(
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
- raw_inputs: List[List[Union[np.ndarray, torch.Tensor, str, bytes]]] = None,
- output_dir_v2: Optional[str] = None,
- param_dict: Optional[dict] = None,
- ):
- logging.info("param_dict: {}".format(param_dict))
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, (list, tuple)):
- if not isinstance(raw_inputs[0], List):
- raw_inputs = [raw_inputs]
-
- assert all([len(example) >= 2 for example in raw_inputs]), \
- "The length of test case in raw_inputs must larger than 1 (>=2)."
-
- def prepare_dataset():
- for idx, example in enumerate(raw_inputs):
- # read waveform file
- example = [load_bytes(x) if isinstance(x, bytes) else x
- for x in example]
- example = [soundfile.read(x)[0] if isinstance(x, str) else x
- for x in example]
- # convert torch tensor to numpy array
- example = [x.numpy() if isinstance(example[0], torch.Tensor) else x
- for x in example]
- speech = example[0]
- logging.info("Extracting profiles for {} waveforms".format(len(example)-1))
- profile = [speech2xvector.calculate_embedding(x) for x in example[1:]]
- profile = torch.cat(profile, dim=0)
- yield ["test{}".format(idx)], {"speech": [speech], "profile": [profile]}
-
- loader = prepare_dataset()
- else:
- raise TypeError("raw_inputs must be a list or tuple in [speech, profile1, profile2, ...] ")
- else:
- # 3. Build data-iterator
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=None,
- collate_fn=None,
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- # 7. Start for-loop
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- os.makedirs(output_path, exist_ok=True)
- output_writer = open("{}/result.txt".format(output_path), "w")
- pse_label_writer = open("{}/labels.txt".format(output_path), "w")
- logging.info("Start to diarize...")
- result_list = []
- for idx, (keys, batch) in enumerate(loader):
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- results, pse_labels = speech2diar(**batch)
- # Only supporting batch_size==1
- key, value = keys[0], output_results_str(results, keys[0])
- item = {"key": key, "value": value}
- result_list.append(item)
- if output_path is not None:
- output_writer.write(value)
- output_writer.flush()
- pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
- pse_label_writer.flush()
-
- if idx % 100 == 0:
- logging.info("Processing {:5d}: {}".format(idx, key))
-
- if output_path is not None:
- output_writer.close()
- pse_label_writer.close()
-
- return result_list
-
- return _forward
-
-
-def inference(
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
- diar_train_config: Optional[str],
- diar_model_file: Optional[str],
- output_dir: Optional[str] = None,
- batch_size: int = 1,
- dtype: str = "float32",
- ngpu: int = 0,
- seed: int = 0,
- num_workers: int = 1,
- log_level: Union[int, str] = "INFO",
- key_file: Optional[str] = None,
- model_tag: Optional[str] = None,
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- smooth_size: int = 83,
- dur_threshold: int = 10,
- out_format: str = "vad",
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- diar_train_config=diar_train_config,
- diar_model_file=diar_model_file,
- output_dir=output_dir,
- batch_size=batch_size,
- dtype=dtype,
- ngpu=ngpu,
- seed=seed,
- num_workers=num_workers,
- log_level=log_level,
- key_file=key_file,
- model_tag=model_tag,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- smooth_size=smooth_size,
- dur_threshold=dur_threshold,
- out_format=out_format,
- **kwargs,
- )
-
- return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Speaker verification/x-vector extraction",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--diar_train_config",
- type=str,
- help="diarization training configuration",
- )
- group.add_argument(
- "--diar_model_file",
- type=str,
- help="diarization model parameter file",
- )
- group.add_argument(
- "--dur_threshold",
- type=int,
- default=10,
- help="The threshold for short segments in number frames"
- )
- parser.add_argument(
- "--smooth_size",
- type=int,
- default=83,
- help="The smoothing window length in number frames"
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
- parser.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- parser.add_argument("--streaming", type=str2bool, default=False)
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- logging.info("args: {}".format(kwargs))
- if args.output_dir is None:
- jobid, n_gpu = 1, 1
- gpuid = args.gpuid_list.split(",")[jobid-1]
- else:
- jobid = int(args.output_dir.split(".")[-1])
- n_gpu = len(args.gpuid_list.split(","))
- gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- results_list = inference(**kwargs)
- for results in results_list:
- print("{} {}".format(results["key"], results["value"]))
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/sv_infer.py b/funasr/bin/sv_infer.py
new file mode 100755
index 000000000..8a9c6e9f3
--- /dev/null
+++ b/funasr/bin/sv_infer.py
@@ -0,0 +1,163 @@
+#!/usr/bin/env python3
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+from kaldiio import WriteHelper
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.sv import SVTask
+from funasr.tasks.asr import ASRTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils.misc import statistic_model_parameters
+
+class Speech2Xvector:
+ """Speech2Xvector class
+
+ Examples:
+ >>> import soundfile
+ >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2xvector(audio)
+ [(text, token, token_int, hypothesis object), ...]
+
+ """
+
+ def __init__(
+ self,
+ sv_train_config: Union[Path, str] = None,
+ sv_model_file: Union[Path, str] = None,
+ device: str = "cpu",
+ batch_size: int = 1,
+ dtype: str = "float32",
+ streaming: bool = False,
+ embedding_node: str = "resnet1_dense",
+ ):
+ assert check_argument_types()
+
+ # TODO: 1. Build SV model
+ sv_model, sv_train_args = SVTask.build_model_from_file(
+ config_file=sv_train_config,
+ model_file=sv_model_file,
+ device=device
+ )
+ logging.info("sv_model: {}".format(sv_model))
+ logging.info("model parameter number: {}".format(statistic_model_parameters(sv_model)))
+ logging.info("sv_train_args: {}".format(sv_train_args))
+ sv_model.to(dtype=getattr(torch, dtype)).eval()
+
+ self.sv_model = sv_model
+ self.sv_train_args = sv_train_args
+ self.device = device
+ self.dtype = dtype
+ self.embedding_node = embedding_node
+
+ @torch.no_grad()
+ def calculate_embedding(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ # data: (Nsamples,) -> (1, Nsamples)
+ speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+ # lengths: (1,)
+ lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+ batch = {"speech": speech, "speech_lengths": lengths}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ # b. Forward Encoder
+ enc, ilens = self.sv_model.encode(**batch)
+
+ # c. Forward Pooling
+ pooling = self.sv_model.pooling_layer(enc)
+
+ # d. Forward Decoder
+ outputs, embeddings = self.sv_model.decoder(pooling)
+
+ if self.embedding_node not in embeddings:
+ raise ValueError("Required embedding node {} not in {}".format(
+ self.embedding_node, embeddings.keys()))
+
+ return embeddings[self.embedding_node]
+
+ @torch.no_grad()
+ def __call__(
+ self, speech: Union[torch.Tensor, np.ndarray],
+ ref_speech: Optional[Union[torch.Tensor, np.ndarray]] = None,
+ ) -> Tuple[torch.Tensor, Union[torch.Tensor, None], Union[torch.Tensor, None]]:
+ """Inference
+
+ Args:
+ speech: Input speech data
+ ref_speech: Reference speech to compare
+ Returns:
+ embedding, ref_embedding, similarity_score
+
+ """
+ assert check_argument_types()
+ self.sv_model.eval()
+ embedding = self.calculate_embedding(speech)
+ ref_emb, score = None, None
+ if ref_speech is not None:
+ ref_emb = self.calculate_embedding(ref_speech)
+ score = torch.cosine_similarity(embedding, ref_emb)
+
+ results = (embedding, ref_emb, score)
+ assert check_return_type(results)
+ return results
+
+ @staticmethod
+ def from_pretrained(
+ model_tag: Optional[str] = None,
+ **kwargs: Optional[Any],
+ ):
+ """Build Speech2Xvector instance from the pretrained model.
+
+ Args:
+ model_tag (Optional[str]): Model tag of the pretrained models.
+ Currently, the tags of espnet_model_zoo are supported.
+
+ Returns:
+ Speech2Xvector: Speech2Xvector instance.
+
+ """
+ if model_tag is not None:
+ try:
+ from espnet_model_zoo.downloader import ModelDownloader
+
+ except ImportError:
+ logging.error(
+ "`espnet_model_zoo` is not installed. "
+ "Please install via `pip install -U espnet_model_zoo`."
+ )
+ raise
+ d = ModelDownloader()
+ kwargs.update(**d.download_and_unpack(model_tag))
+
+ return Speech2Xvector(**kwargs)
+
+
+
+
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
deleted file mode 100755
index 76b1dfbb8..000000000
--- a/funasr/bin/sv_inference.py
+++ /dev/null
@@ -1,443 +0,0 @@
-#!/usr/bin/env python3
-# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
-# MIT License (https://opensource.org/licenses/MIT)
-
-import argparse
-import logging
-import os
-import sys
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-
-import numpy as np
-import torch
-from kaldiio import WriteHelper
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.tasks.sv import SVTask
-from funasr.tasks.asr import ASRTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils.misc import statistic_model_parameters
-
-class Speech2Xvector:
- """Speech2Xvector class
-
- Examples:
- >>> import soundfile
- >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2xvector(audio)
- [(text, token, token_int, hypothesis object), ...]
-
- """
-
- def __init__(
- self,
- sv_train_config: Union[Path, str] = None,
- sv_model_file: Union[Path, str] = None,
- device: str = "cpu",
- batch_size: int = 1,
- dtype: str = "float32",
- streaming: bool = False,
- embedding_node: str = "resnet1_dense",
- ):
- assert check_argument_types()
-
- # TODO: 1. Build SV model
- sv_model, sv_train_args = SVTask.build_model_from_file(
- config_file=sv_train_config,
- model_file=sv_model_file,
- device=device
- )
- logging.info("sv_model: {}".format(sv_model))
- logging.info("model parameter number: {}".format(statistic_model_parameters(sv_model)))
- logging.info("sv_train_args: {}".format(sv_train_args))
- sv_model.to(dtype=getattr(torch, dtype)).eval()
-
- self.sv_model = sv_model
- self.sv_train_args = sv_train_args
- self.device = device
- self.dtype = dtype
- self.embedding_node = embedding_node
-
- @torch.no_grad()
- def calculate_embedding(self, speech: Union[torch.Tensor, np.ndarray]) -> torch.Tensor:
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- # data: (Nsamples,) -> (1, Nsamples)
- speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
- # lengths: (1,)
- lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
- batch = {"speech": speech, "speech_lengths": lengths}
-
- # a. To device
- batch = to_device(batch, device=self.device)
-
- # b. Forward Encoder
- enc, ilens = self.sv_model.encode(**batch)
-
- # c. Forward Pooling
- pooling = self.sv_model.pooling_layer(enc)
-
- # d. Forward Decoder
- outputs, embeddings = self.sv_model.decoder(pooling)
-
- if self.embedding_node not in embeddings:
- raise ValueError("Required embedding node {} not in {}".format(
- self.embedding_node, embeddings.keys()))
-
- return embeddings[self.embedding_node]
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray],
- ref_speech: Optional[Union[torch.Tensor, np.ndarray]] = None,
- ) -> Tuple[torch.Tensor, Union[torch.Tensor, None], Union[torch.Tensor, None]]:
- """Inference
-
- Args:
- speech: Input speech data
- ref_speech: Reference speech to compare
- Returns:
- embedding, ref_embedding, similarity_score
-
- """
- assert check_argument_types()
- self.sv_model.eval()
- embedding = self.calculate_embedding(speech)
- ref_emb, score = None, None
- if ref_speech is not None:
- ref_emb = self.calculate_embedding(ref_speech)
- score = torch.cosine_similarity(embedding, ref_emb)
-
- results = (embedding, ref_emb, score)
- assert check_return_type(results)
- return results
-
- @staticmethod
- def from_pretrained(
- model_tag: Optional[str] = None,
- **kwargs: Optional[Any],
- ):
- """Build Speech2Xvector instance from the pretrained model.
-
- Args:
- model_tag (Optional[str]): Model tag of the pretrained models.
- Currently, the tags of espnet_model_zoo are supported.
-
- Returns:
- Speech2Xvector: Speech2Xvector instance.
-
- """
- if model_tag is not None:
- try:
- from espnet_model_zoo.downloader import ModelDownloader
-
- except ImportError:
- logging.error(
- "`espnet_model_zoo` is not installed. "
- "Please install via `pip install -U espnet_model_zoo`."
- )
- raise
- d = ModelDownloader()
- kwargs.update(**d.download_and_unpack(model_tag))
-
- return Speech2Xvector(**kwargs)
-
-
-def inference_modelscope(
- output_dir: Optional[str] = None,
- batch_size: int = 1,
- dtype: str = "float32",
- ngpu: int = 1,
- seed: int = 0,
- num_workers: int = 0,
- log_level: Union[int, str] = "INFO",
- key_file: Optional[str] = None,
- sv_train_config: Optional[str] = "sv.yaml",
- sv_model_file: Optional[str] = "sv.pb",
- model_tag: Optional[str] = None,
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- embedding_node: str = "resnet1_dense",
- sv_threshold: float = 0.9465,
- param_dict: Optional[dict] = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
- logging.info("param_dict: {}".format(param_dict))
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2xvector
- speech2xvector_kwargs = dict(
- sv_train_config=sv_train_config,
- sv_model_file=sv_model_file,
- device=device,
- dtype=dtype,
- streaming=streaming,
- embedding_node=embedding_node
- )
- logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
- speech2xvector = Speech2Xvector.from_pretrained(
- model_tag=model_tag,
- **speech2xvector_kwargs,
- )
- speech2xvector.sv_model.eval()
-
- def _forward(
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- param_dict: Optional[dict] = None,
- ):
- logging.info("param_dict: {}".format(param_dict))
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
-
- # 3. Build data-iterator
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=None,
- collate_fn=None,
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- # 7 .Start for-loop
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- embd_writer, ref_embd_writer, score_writer = None, None, None
- if output_path is not None:
- os.makedirs(output_path, exist_ok=True)
- embd_writer = WriteHelper("ark,scp:{}/xvector.ark,{}/xvector.scp".format(output_path, output_path))
- sv_result_list = []
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
-
- embedding, ref_embedding, score = speech2xvector(**batch)
- # Only supporting batch_size==1
- key = keys[0]
- normalized_score = 0.0
- if score is not None:
- score = score.item()
- normalized_score = max(score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
- item = {"key": key, "value": normalized_score}
- else:
- item = {"key": key, "value": embedding.squeeze(0).cpu().numpy()}
- sv_result_list.append(item)
- if output_path is not None:
- embd_writer(key, embedding[0].cpu().numpy())
- if ref_embedding is not None:
- if ref_embd_writer is None:
- ref_embd_writer = WriteHelper(
- "ark,scp:{}/ref_xvector.ark,{}/ref_xvector.scp".format(output_path, output_path)
- )
- score_writer = open(os.path.join(output_path, "score.txt"), "w")
- ref_embd_writer(key, ref_embedding[0].cpu().numpy())
- score_writer.write("{} {:.6f}\n".format(key, normalized_score))
-
- if output_path is not None:
- embd_writer.close()
- if ref_embd_writer is not None:
- ref_embd_writer.close()
- score_writer.close()
-
- return sv_result_list
-
- return _forward
-
-
-def inference(
- output_dir: Optional[str],
- batch_size: int,
- dtype: str,
- ngpu: int,
- seed: int,
- num_workers: int,
- log_level: Union[int, str],
- data_path_and_name_and_type: Sequence[Tuple[str, str, str]],
- key_file: Optional[str],
- sv_train_config: Optional[str],
- sv_model_file: Optional[str],
- model_tag: Optional[str],
- allow_variable_data_keys: bool = True,
- streaming: bool = False,
- embedding_node: str = "resnet1_dense",
- sv_threshold: float = 0.9465,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- output_dir=output_dir,
- batch_size=batch_size,
- dtype=dtype,
- ngpu=ngpu,
- seed=seed,
- num_workers=num_workers,
- log_level=log_level,
- key_file=key_file,
- sv_train_config=sv_train_config,
- sv_model_file=sv_model_file,
- model_tag=model_tag,
- allow_variable_data_keys=allow_variable_data_keys,
- streaming=streaming,
- embedding_node=embedding_node,
- sv_threshold=sv_threshold,
- **kwargs,
- )
-
- return inference_pipeline(data_path_and_name_and_type, raw_inputs=None)
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Speaker verification/x-vector extraction",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--sv_train_config",
- type=str,
- help="SV training configuration",
- )
- group.add_argument(
- "--sv_model_file",
- type=str,
- help="SV model parameter file",
- )
- group.add_argument(
- "--sv_threshold",
- type=float,
- default=0.9465,
- help="The threshold for verification"
- )
- group.add_argument(
- "--model_tag",
- type=str,
- help="Pretrained model tag. If specify this option, *_train_config and "
- "*_file will be overwritten",
- )
- parser.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- parser.add_argument("--streaming", type=str2bool, default=False)
- parser.add_argument("--embedding_node", type=str, default="resnet1_dense")
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- logging.info("args: {}".format(kwargs))
- if args.output_dir is None:
- jobid, n_gpu = 1, 1
- gpuid = args.gpuid_list.split(",")[jobid-1]
- else:
- jobid = int(args.output_dir.split(".")[-1])
- n_gpu = len(args.gpuid_list.split(","))
- gpuid = args.gpuid_list.split(",")[(jobid - 1) % n_gpu]
- os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
- os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- results_list = inference(**kwargs)
- for results in results_list:
- print("{} {}".format(results["key"], results["value"]))
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/sv_inference_launch.py b/funasr/bin/sv_inference_launch.py
index 880607013..24b86386f 100755
--- a/funasr/bin/sv_inference_launch.py
+++ b/funasr/bin/sv_inference_launch.py
@@ -14,6 +14,164 @@ from funasr.utils.cli_utils import get_commandline_args
from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
+import argparse
+import logging
+import os
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+
+import numpy as np
+import torch
+from kaldiio import WriteHelper
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.tasks.sv import SVTask
+from funasr.tasks.asr import ASRTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils.misc import statistic_model_parameters
+from funasr.bin.sv_infer import Speech2Xvector
+
+def inference_sv(
+ output_dir: Optional[str] = None,
+ batch_size: int = 1,
+ dtype: str = "float32",
+ ngpu: int = 1,
+ seed: int = 0,
+ num_workers: int = 0,
+ log_level: Union[int, str] = "INFO",
+ key_file: Optional[str] = None,
+ sv_train_config: Optional[str] = "sv.yaml",
+ sv_model_file: Optional[str] = "sv.pb",
+ model_tag: Optional[str] = None,
+ allow_variable_data_keys: bool = True,
+ streaming: bool = False,
+ embedding_node: str = "resnet1_dense",
+ sv_threshold: float = 0.9465,
+ param_dict: Optional[dict] = None,
+ **kwargs,
+):
+ assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+ logging.info("param_dict: {}".format(param_dict))
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build speech2xvector
+ speech2xvector_kwargs = dict(
+ sv_train_config=sv_train_config,
+ sv_model_file=sv_model_file,
+ device=device,
+ dtype=dtype,
+ streaming=streaming,
+ embedding_node=embedding_node
+ )
+ logging.info("speech2xvector_kwargs: {}".format(speech2xvector_kwargs))
+ speech2xvector = Speech2Xvector.from_pretrained(
+ model_tag=model_tag,
+ **speech2xvector_kwargs,
+ )
+ speech2xvector.sv_model.eval()
+
+ def _forward(
+ data_path_and_name_and_type: Sequence[Tuple[str, str, str]] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ param_dict: Optional[dict] = None,
+ ):
+ logging.info("param_dict: {}".format(param_dict))
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+
+ # 3. Build data-iterator
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=None,
+ collate_fn=None,
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ # 7 .Start for-loop
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ embd_writer, ref_embd_writer, score_writer = None, None, None
+ if output_path is not None:
+ os.makedirs(output_path, exist_ok=True)
+ embd_writer = WriteHelper("ark,scp:{}/xvector.ark,{}/xvector.scp".format(output_path, output_path))
+ sv_result_list = []
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+
+ embedding, ref_embedding, score = speech2xvector(**batch)
+ # Only supporting batch_size==1
+ key = keys[0]
+ normalized_score = 0.0
+ if score is not None:
+ score = score.item()
+ normalized_score = max(score - sv_threshold, 0.0) / (1.0 - sv_threshold) * 100.0
+ item = {"key": key, "value": normalized_score}
+ else:
+ item = {"key": key, "value": embedding.squeeze(0).cpu().numpy()}
+ sv_result_list.append(item)
+ if output_path is not None:
+ embd_writer(key, embedding[0].cpu().numpy())
+ if ref_embedding is not None:
+ if ref_embd_writer is None:
+ ref_embd_writer = WriteHelper(
+ "ark,scp:{}/ref_xvector.ark,{}/ref_xvector.scp".format(output_path, output_path)
+ )
+ score_writer = open(os.path.join(output_path, "score.txt"), "w")
+ ref_embd_writer(key, ref_embedding[0].cpu().numpy())
+ score_writer.write("{} {:.6f}\n".format(key, normalized_score))
+
+ if output_path is not None:
+ embd_writer.close()
+ if ref_embd_writer is not None:
+ ref_embd_writer.close()
+ score_writer.close()
+
+ return sv_result_list
+
+ return _forward
def get_parser():
@@ -133,8 +291,7 @@ def get_parser():
def inference_launch(mode, **kwargs):
if mode == "sv":
- from funasr.bin.sv_inference import inference_modelscope
- return inference_modelscope(**kwargs)
+ return inference_sv(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
@@ -167,7 +324,8 @@ def main(cmd=None):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- inference_launch(**kwargs)
+ inference_pipeline = inference_launch(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"])
if __name__ == "__main__":
diff --git a/funasr/bin/tp_infer.py b/funasr/bin/tp_infer.py
new file mode 100644
index 000000000..c83ceeaa4
--- /dev/null
+++ b/funasr/bin/tp_infer.py
@@ -0,0 +1,115 @@
+import argparse
+import logging
+from optparse import Option
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.datasets.preprocessor import LMPreprocessor
+from funasr.tasks.asr import ASRTaskAligner as ASRTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
+
+
+
+
+class Speech2Timestamp:
+ def __init__(
+ self,
+ timestamp_infer_config: Union[Path, str] = None,
+ timestamp_model_file: Union[Path, str] = None,
+ timestamp_cmvn_file: Union[Path, str] = None,
+ device: str = "cpu",
+ dtype: str = "float32",
+ **kwargs,
+ ):
+ assert check_argument_types()
+ # 1. Build ASR model
+ tp_model, tp_train_args = ASRTask.build_model_from_file(
+ timestamp_infer_config, timestamp_model_file, device=device
+ )
+ if 'cuda' in device:
+ tp_model = tp_model.cuda() # force model to cuda
+
+ frontend = None
+ if tp_train_args.frontend is not None:
+ frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)
+
+ logging.info("tp_model: {}".format(tp_model))
+ logging.info("tp_train_args: {}".format(tp_train_args))
+ tp_model.to(dtype=getattr(torch, dtype)).eval()
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+
+ self.tp_model = tp_model
+ self.tp_train_args = tp_train_args
+
+ token_list = self.tp_model.token_list
+ self.converter = TokenIDConverter(token_list=token_list)
+
+ self.device = device
+ self.dtype = dtype
+ self.frontend = frontend
+ self.encoder_downsampling_factor = 1
+ if tp_train_args.encoder_conf["input_layer"] == "conv2d":
+ self.encoder_downsampling_factor = 4
+
+ @torch.no_grad()
+ def __call__(
+ self,
+ speech: Union[torch.Tensor, np.ndarray],
+ speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ text_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.tp_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+
+ # lfr_factor = max(1, (feats.size()[-1]//80)-1)
+ batch = {"speech": feats, "speech_lengths": feats_len}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ # b. Forward Encoder
+ enc, enc_len = self.tp_model.encode(**batch)
+ if isinstance(enc, tuple):
+ enc = enc[0]
+
+ # c. Forward Predictor
+ _, _, us_alphas, us_peaks = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1)
+ return us_alphas, us_peaks
+
+
+
diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py
deleted file mode 100644
index 6e513c5a0..000000000
--- a/funasr/bin/tp_inference.py
+++ /dev/null
@@ -1,399 +0,0 @@
-import argparse
-import logging
-from optparse import Option
-import sys
-import json
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.datasets.preprocessor import LMPreprocessor
-from funasr.tasks.asr import ASRTaskAligner as ASRTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.models.frontend.wav_frontend import WavFrontend
-from funasr.text.token_id_converter import TokenIDConverter
-from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
-
-
-header_colors = '\033[95m'
-end_colors = '\033[0m'
-
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
- 'audio_fs': 16000,
- 'model_fs': 16000
-}
-
-
-class SpeechText2Timestamp:
- def __init__(
- self,
- timestamp_infer_config: Union[Path, str] = None,
- timestamp_model_file: Union[Path, str] = None,
- timestamp_cmvn_file: Union[Path, str] = None,
- device: str = "cpu",
- dtype: str = "float32",
- **kwargs,
- ):
- assert check_argument_types()
- # 1. Build ASR model
- tp_model, tp_train_args = ASRTask.build_model_from_file(
- timestamp_infer_config, timestamp_model_file, device=device
- )
- if 'cuda' in device:
- tp_model = tp_model.cuda() # force model to cuda
-
- frontend = None
- if tp_train_args.frontend is not None:
- frontend = WavFrontend(cmvn_file=timestamp_cmvn_file, **tp_train_args.frontend_conf)
-
- logging.info("tp_model: {}".format(tp_model))
- logging.info("tp_train_args: {}".format(tp_train_args))
- tp_model.to(dtype=getattr(torch, dtype)).eval()
-
- logging.info(f"Decoding device={device}, dtype={dtype}")
-
-
- self.tp_model = tp_model
- self.tp_train_args = tp_train_args
-
- token_list = self.tp_model.token_list
- self.converter = TokenIDConverter(token_list=token_list)
-
- self.device = device
- self.dtype = dtype
- self.frontend = frontend
- self.encoder_downsampling_factor = 1
- if tp_train_args.encoder_conf["input_layer"] == "conv2d":
- self.encoder_downsampling_factor = 4
-
- @torch.no_grad()
- def __call__(
- self,
- speech: Union[torch.Tensor, np.ndarray],
- speech_lengths: Union[torch.Tensor, np.ndarray] = None,
- text_lengths: Union[torch.Tensor, np.ndarray] = None
- ):
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- self.tp_model.frontend = None
- else:
- feats = speech
- feats_len = speech_lengths
-
- # lfr_factor = max(1, (feats.size()[-1]//80)-1)
- batch = {"speech": feats, "speech_lengths": feats_len}
-
- # a. To device
- batch = to_device(batch, device=self.device)
-
- # b. Forward Encoder
- enc, enc_len = self.tp_model.encode(**batch)
- if isinstance(enc, tuple):
- enc = enc[0]
-
- # c. Forward Predictor
- _, _, us_alphas, us_peaks = self.tp_model.calc_predictor_timestamp(enc, enc_len, text_lengths.to(self.device)+1)
- return us_alphas, us_peaks
-
-
-def inference(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- timestamp_infer_config: Optional[str],
- timestamp_model_file: Optional[str],
- timestamp_cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- split_with_space: bool = True,
- seg_dict_file: Optional[str] = None,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- batch_size=batch_size,
- ngpu=ngpu,
- log_level=log_level,
- timestamp_infer_config=timestamp_infer_config,
- timestamp_model_file=timestamp_model_file,
- timestamp_cmvn_file=timestamp_cmvn_file,
- key_file=key_file,
- allow_variable_data_keys=allow_variable_data_keys,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- num_workers=num_workers,
- split_with_space=split_with_space,
- seg_dict_file=seg_dict_file,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-
-def inference_modelscope(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- timestamp_infer_config: Optional[str],
- timestamp_model_file: Optional[str],
- timestamp_cmvn_file: Optional[str] = None,
- # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- split_with_space: bool = True,
- seg_dict_file: Optional[str] = None,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
- if ngpu > 1:
- raise NotImplementedError("only single GPU decoding is supported")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2vadsegment
- speechtext2timestamp_kwargs = dict(
- timestamp_infer_config=timestamp_infer_config,
- timestamp_model_file=timestamp_model_file,
- timestamp_cmvn_file=timestamp_cmvn_file,
- device=device,
- dtype=dtype,
- )
- logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs))
- speechtext2timestamp = SpeechText2Timestamp(**speechtext2timestamp_kwargs)
-
- preprocessor = LMPreprocessor(
- train=False,
- token_type=speechtext2timestamp.tp_train_args.token_type,
- token_list=speechtext2timestamp.tp_train_args.token_list,
- bpemodel=None,
- text_cleaner=None,
- g2p_type=None,
- text_name="text",
- non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols,
- split_with_space=split_with_space,
- seg_dict_file=seg_dict_file,
- )
-
- if output_dir is not None:
- writer = DatadirWriter(output_dir)
- tp_writer = writer[f"timestamp_prediction"]
- # ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
- else:
- tp_writer = None
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- **kwargs
- ):
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- writer = None
- if output_path is not None:
- writer = DatadirWriter(output_path)
- tp_writer = writer[f"timestamp_prediction"]
- else:
- tp_writer = None
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
-
- loader = ASRTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=preprocessor,
- collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- tp_result_list = []
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-
- logging.info("timestamp predicting, utt_id: {}".format(keys))
- _batch = {'speech':batch['speech'],
- 'speech_lengths':batch['speech_lengths'],
- 'text_lengths':batch['text_lengths']}
- us_alphas, us_cif_peak = speechtext2timestamp(**_batch)
-
- for batch_id in range(_bs):
- key = keys[batch_id]
- token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
- ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token, force_time_shift=-3.0)
- logging.warning(ts_str)
- item = {'key': key, 'value': ts_str, 'timestamp':ts_list}
- if tp_writer is not None:
- tp_writer["tp_sync"][key+'#'] = ts_str
- tp_writer["tp_time"][key+'#'] = str(ts_list)
- tp_result_list.append(item)
- return tp_result_list
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="Timestamp Prediction Inference",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=0,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--timestamp_infer_config",
- type=str,
- help="VAD infer configuration",
- )
- group.add_argument(
- "--timestamp_model_file",
- type=str,
- help="VAD model parameter file",
- )
- group.add_argument(
- "--timestamp_cmvn_file",
- type=str,
- help="Global cmvn file",
- )
-
- group = parser.add_argument_group("infer related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
- group.add_argument(
- "--seg_dict_file",
- type=str,
- default=None,
- help="The batch size for inference",
- )
- group.add_argument(
- "--split_with_space",
- type=bool,
- default=False,
- help="The batch size for inference",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py
index 6cdff057d..2b2b2aebf 100644
--- a/funasr/bin/tp_inference_launch.py
+++ b/funasr/bin/tp_inference_launch.py
@@ -13,6 +13,171 @@ from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
+import argparse
+import logging
+from optparse import Option
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.datasets.preprocessor import LMPreprocessor
+from funasr.tasks.asr import ASRTaskAligner as ASRTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
+from funasr.bin.tp_infer import Speech2Timestamp
+
+def inference_tp(
+ batch_size: int,
+ ngpu: int,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type is not taken here; it is an argument of the returned _forward.
+ timestamp_infer_config: Optional[str],
+ timestamp_model_file: Optional[str],
+ timestamp_cmvn_file: Optional[str] = None,
+ # raw_inputs is likewise an argument of the returned _forward.
+ key_file: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ num_workers: int = 1,
+ split_with_space: bool = True,
+ seg_dict_file: Optional[str] = None,
+ **kwargs,
+):
+ """Set up timestamp prediction once and return a callable that runs it on data.
+
+ The setup limits the torch CPU thread count, configures logging, picks the
+ device, seeds the random generators, and builds a Speech2Timestamp object plus
+ an LMPreprocessor. The returned function builds a data iterator per call.
+
+ Args:
+ batch_size: Batch size for the data iterator; values above 1 are rejected.
+ ngpu: Number of GPUs; values above 1 are rejected. "cuda" is selected only
+ when ngpu >= 1 and torch.cuda.is_available(), otherwise "cpu".
+ log_level: Level passed to logging.basicConfig.
+ timestamp_infer_config: Forwarded to Speech2Timestamp.
+ timestamp_model_file: Forwarded to Speech2Timestamp.
+ timestamp_cmvn_file: Forwarded to Speech2Timestamp.
+ key_file: Forwarded to ASRTask.build_streaming_iterator.
+ allow_variable_data_keys: Forwarded to ASRTask.build_streaming_iterator.
+ output_dir: Directory given to DatadirWriter when _forward receives no
+ output_dir_v2; None disables writing.
+ dtype: Forwarded to Speech2Timestamp and to the data iterator.
+ seed: Passed to set_all_random_seed.
+ num_workers: Forwarded to ASRTask.build_streaming_iterator.
+ split_with_space: Forwarded to LMPreprocessor.
+ seg_dict_file: Forwarded to LMPreprocessor.
+ **kwargs: Only "ncpu" (default 1) is read, for torch.set_num_threads.
+
+ Returns:
+ The nested function _forward.
+
+ Raises:
+ NotImplementedError: If batch_size > 1 or ngpu > 1.
+ """
+ assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
+ if batch_size > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ # Fall back to CPU when no GPU was requested or CUDA is unavailable.
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build the Speech2Timestamp predictor
+ speechtext2timestamp_kwargs = dict(
+ timestamp_infer_config=timestamp_infer_config,
+ timestamp_model_file=timestamp_model_file,
+ timestamp_cmvn_file=timestamp_cmvn_file,
+ device=device,
+ dtype=dtype,
+ )
+ logging.info("speechtext2timestamp_kwargs: {}".format(speechtext2timestamp_kwargs))
+ speechtext2timestamp = Speech2Timestamp(**speechtext2timestamp_kwargs)
+
+ # Text preprocessor configured from the training arguments loaded with the model.
+ preprocessor = LMPreprocessor(
+ train=False,
+ token_type=speechtext2timestamp.tp_train_args.token_type,
+ token_list=speechtext2timestamp.tp_train_args.token_list,
+ bpemodel=None,
+ text_cleaner=None,
+ g2p_type=None,
+ text_name="text",
+ non_linguistic_symbols=speechtext2timestamp.tp_train_args.non_linguistic_symbols,
+ split_with_space=split_with_space,
+ seg_dict_file=seg_dict_file,
+ )
+
+ # NOTE(review): writer/tp_writer assigned here are not read by _forward, which
+ # assigns its own local names before use -- this block looks redundant; confirm.
+ if output_dir is not None:
+ writer = DatadirWriter(output_dir)
+ tp_writer = writer[f"timestamp_prediction"]
+ # ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list)
+ else:
+ tp_writer = None
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ fs: dict = None,
+ param_dict: dict = None,
+ **kwargs
+ ):
+ """Run timestamp prediction over one data source and collect the results.
+
+ Args:
+ data_path_and_name_and_type: Data description handed to
+ ASRTask.build_streaming_iterator.
+ raw_inputs: Used only when data_path_and_name_and_type is None; a torch
+ tensor is converted with .numpy() and wrapped as
+ [raw_inputs, "speech", "waveform"].
+ output_dir_v2: Output directory for this call; takes precedence over the
+ enclosing output_dir.
+ fs: Not read in this function.
+ param_dict: Not read in this function.
+ **kwargs: Not read in this function.
+
+ Returns:
+ A list with one dict per utterance holding 'key', 'value' (the timestamp
+ string) and 'timestamp' (the timestamp list).
+ """
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ writer = None
+ if output_path is not None:
+ writer = DatadirWriter(output_path)
+ tp_writer = writer[f"timestamp_prediction"]
+ else:
+ tp_writer = None
+ # 3. Build data-iterator
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=preprocessor,
+ collate_fn=ASRTask.build_collate_fn(speechtext2timestamp.tp_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ tp_result_list = []
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ # Batch size is taken from the length of the first value in the batch dict.
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+
+ logging.info("timestamp predicting, utt_id: {}".format(keys))
+ # Only speech, its lengths and the text lengths go to the model; the text
+ # ids themselves are converted to tokens afterwards.
+ _batch = {'speech': batch['speech'],
+ 'speech_lengths': batch['speech_lengths'],
+ 'text_lengths': batch['text_lengths']}
+ us_alphas, us_cif_peak = speechtext2timestamp(**_batch)
+
+ for batch_id in range(_bs):
+ key = keys[batch_id]
+ token = speechtext2timestamp.converter.ids2tokens(batch['text'][batch_id])
+ # NOTE(review): the meaning and unit of force_time_shift=-3.0 are defined by
+ # ts_prediction_lfr6_standard and are not visible here -- confirm.
+ ts_str, ts_list = ts_prediction_lfr6_standard(us_alphas[batch_id], us_cif_peak[batch_id], token,
+ force_time_shift=-3.0)
+ # NOTE(review): the result is logged at WARNING level; presumably so that it
+ # is shown under restrictive log levels -- confirm this is intended.
+ logging.warning(ts_str)
+ item = {'key': key, 'value': ts_str, 'timestamp': ts_list}
+ if tp_writer is not None:
+ # Entries are stored under the utterance key with a '#' suffix.
+ tp_writer["tp_sync"][key + '#'] = ts_str
+ tp_writer["tp_time"][key + '#'] = str(ts_list)
+ tp_result_list.append(item)
+ return tp_result_list
+
+ return _forward
+
def get_parser():
parser = config_argparse.ArgumentParser(
@@ -102,8 +267,7 @@ def get_parser():
def inference_launch(mode, **kwargs):
if mode == "tp_norm":
- from funasr.bin.tp_inference import inference_modelscope
- return inference_modelscope(**kwargs)
+ return inference_tp(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
@@ -135,7 +299,9 @@ def main(cmd=None):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- inference_launch(**kwargs)
+ inference_pipeline = inference_launch(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"])
+
if __name__ == "__main__":
diff --git a/funasr/bin/vad_infer.py b/funasr/bin/vad_infer.py
new file mode 100644
index 000000000..5835e77df
--- /dev/null
+++ b/funasr/bin/vad_infer.py
@@ -0,0 +1,196 @@
+import argparse
+import logging
+import os
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import math
+import numpy as np
+import torch
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.scorers.scorer_interface import BatchScorerInterface
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.vad import VADTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
+
+
+
+class Speech2VadSegment:
+    """Offline voice-activity-detection wrapper around a model built by VADTask.
+
+    The constructor loads the model from a config/checkpoint pair and, when the
+    loaded config declares a frontend, creates a WavFrontend for feature
+    extraction. Calling the instance extracts features, feeds them to the model
+    in chunks, and returns the fbank features with the per-item segments.
+
+    Examples:
+        >>> import soundfile
+        >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt")
+        >>> audio, rate = soundfile.read("speech.wav")
+        >>> speech2segment(audio)
+        [[10, 230], [245, 450], ...]
+
+    NOTE(review): the call returns ``(fbanks, segments)`` and indexes ``speech``
+    with a leading batch axis; the example above shows segment pairs only --
+    confirm and update the example.
+
+    """
+
+    def __init__(
+        self,
+        vad_infer_config: Union[Path, str] = None,
+        vad_model_file: Union[Path, str] = None,
+        vad_cmvn_file: Union[Path, str] = None,
+        device: str = "cpu",
+        batch_size: int = 1,
+        dtype: str = "float32",
+        **kwargs,
+    ):
+        """Load the VAD model and, if configured, its feature frontend.
+
+        Args:
+            vad_infer_config: Config file passed to VADTask.build_model_from_file.
+            vad_model_file: Model parameter file passed to the same builder.
+            vad_cmvn_file: CMVN file handed to WavFrontend as ``cmvn_file``.
+            device: Device passed to the model builder; features are moved to it
+                in ``__call__``.
+            batch_size: Number of per-item segment lists created in ``__call__``.
+            dtype: Name of a torch dtype (e.g. "float32") the model is cast to.
+            **kwargs: Accepted and ignored.
+        """
+        assert check_argument_types()
+
+        # 1. Build vad model
+        vad_model, vad_infer_args = VADTask.build_model_from_file(
+            vad_infer_config, vad_model_file, device
+        )
+        frontend = None
+        if vad_infer_args.frontend is not None:
+            frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf)
+
+        logging.info("vad_model: {}".format(vad_model))
+        logging.info("vad_infer_args: {}".format(vad_infer_args))
+        vad_model.to(dtype=getattr(torch, dtype)).eval()
+
+        self.vad_model = vad_model
+        self.vad_infer_args = vad_infer_args
+        self.device = device
+        self.dtype = dtype
+        self.frontend = frontend
+        self.batch_size = batch_size
+
+    @torch.no_grad()
+    def __call__(
+        self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+        in_cache: Optional[Dict[str, torch.Tensor]] = None
+    ) -> Tuple[torch.Tensor, List[List[Any]]]:
+        """Run VAD over a batch of waveforms.
+
+        Args:
+            speech: Input waveform(s); a numpy array is converted to a tensor.
+                It is sliced as ``speech[:, start:end]``, so a leading batch
+                axis is required.
+            speech_lengths: Lengths forwarded to the frontend together with
+                ``speech``.
+            in_cache: Model cache passed with the first chunk. ``None`` (the
+                default) starts from a new empty dict on every call.
+
+        Returns:
+            ``(fbanks, segments)``: the frontend fbank features moved to
+            ``self.device`` and a list holding one accumulated segment list
+            per batch item.
+
+        Raises:
+            Exception: If no frontend is configured.
+        """
+        assert check_argument_types()
+
+        # A dict() default is evaluated once at definition time and then shared by
+        # every call that omits in_cache; since the dict is handed to the model,
+        # state could carry over between unrelated calls. Create it per call.
+        if in_cache is None:
+            in_cache = dict()
+
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+
+        if self.frontend is not None:
+            self.frontend.filter_length_max = math.inf
+            fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
+            feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len)
+            fbanks = to_device(fbanks, device=self.device)
+            feats = to_device(feats, device=self.device)
+            feats_len = feats_len.int()
+        else:
+            raise Exception("Need to extract feats first, please configure frontend configuration")
+
+        # b. Forward Encoder streaming
+        t_offset = 0
+        step = min(feats_len.max(), 6000)
+        # One independent list per batch item. `[[]] * n` repeats a reference to a
+        # single list, so the in-place `+=` below would grow every entry at once.
+        segments = [[] for _ in range(self.batch_size)]
+        # NOTE(review): feats_len is used as a scalar by range() and by the
+        # comparison below; presumably it holds a single length -- confirm.
+        for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
+            if t_offset + step >= feats_len - 1:
+                # Last chunk: shrink it to the remaining frames and flag it as final.
+                step = feats_len - t_offset
+                is_final = True
+            else:
+                is_final = False
+            batch = {
+                "feats": feats[:, t_offset:t_offset + step, :],
+                # NOTE(review): 160 and 400 look like the frame shift and frame
+                # length in samples used to map frame offsets back onto the
+                # waveform -- confirm against the frontend configuration.
+                "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)],
+                "is_final": is_final,
+                "in_cache": in_cache
+            }
+            # a. To device
+            #batch = to_device(batch, device=self.device)
+            # The cache returned by the model is fed into the next chunk.
+            segments_part, in_cache = self.vad_model(**batch)
+            if segments_part:
+                for batch_num in range(0, self.batch_size):
+                    segments[batch_num] += segments_part[batch_num]
+        return fbanks, segments
+
+class Speech2VadSegmentOnline(Speech2VadSegment):
+    """Streaming variant of Speech2VadSegment that uses WavFrontendOnline.
+
+    Each call processes one piece of audio and returns the fbank features, the
+    segments reported for that piece, and the model cache to pass to the next
+    call.
+
+    Examples:
+        >>> import soundfile
+        >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
+        >>> audio, rate = soundfile.read("speech.wav")
+        >>> speech2segment(audio)
+        [[10, 230], [245, 450], ...]
+
+    NOTE(review): ``__init__`` accepts keyword arguments only and the call
+    returns ``(fbanks, segments, in_cache)``; the example above does not
+    reflect either -- confirm and update the example.
+
+    """
+    def __init__(self, **kwargs):
+        """Build the base object, then replace its frontend with WavFrontendOnline.
+
+        Args:
+            **kwargs: Forwarded unchanged to Speech2VadSegment.__init__;
+                ``vad_cmvn_file`` is additionally read here and passed to the
+                online frontend as ``cmvn_file``.
+        """
+        super(Speech2VadSegmentOnline, self).__init__(**kwargs)
+        vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
+        self.frontend = None
+        if self.vad_infer_args.frontend is not None:
+            self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
+
+    @torch.no_grad()
+    def __call__(
+        self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+        in_cache: Optional[Dict[str, torch.Tensor]] = None, is_final: bool = False, max_end_sil: int = 800
+    ) -> Tuple[torch.Tensor, List[List[int]], Dict[str, torch.Tensor]]:
+        """Run one streaming VAD step.
+
+        Args:
+            speech: Input waveform(s); a numpy array is converted to a tensor.
+                ``speech.shape[0]`` is taken as the batch size.
+            speech_lengths: Lengths forwarded to the frontend together with
+                ``speech``.
+            in_cache: Model cache from the previous step. ``None`` (the
+                default) starts from a new empty dict.
+            is_final: Forwarded to the frontend and to the model's
+                ``forward_online``.
+            max_end_sil: Forwarded to the model's ``forward_online``.
+
+        Returns:
+            ``(fbanks, segments, in_cache)``. When the frontend yields no
+            features, ``segments`` holds one empty list per batch item and
+            ``in_cache`` is returned as received (or the new empty dict).
+
+        Raises:
+            Exception: If no frontend is configured.
+        """
+        assert check_argument_types()
+
+        # A dict() default is evaluated once at definition time and then shared by
+        # every call that omits in_cache; since the dict is handed to the model and
+        # returned to the caller, create a new one per call instead.
+        if in_cache is None:
+            in_cache = dict()
+
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+        batch_size = speech.shape[0]
+        # Result used when no features are produced: independent empty lists, not
+        # `[[]] * n`, which would repeat a reference to one shared list.
+        segments = [[] for _ in range(batch_size)]
+        if self.frontend is not None:
+            feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
+            fbanks, _ = self.frontend.get_fbank()
+        else:
+            raise Exception("Need to extract feats first, please configure frontend configuration")
+        # Only call the model when the frontend produced features for this step.
+        if feats.shape[0]:
+            feats = to_device(feats, device=self.device)
+            feats_len = feats_len.int()
+            waveforms = self.frontend.get_waveforms()
+
+            batch = {
+                "feats": feats,
+                "waveform": waveforms,
+                "in_cache": in_cache,
+                "is_final": is_final,
+                "max_end_sil": max_end_sil
+            }
+            # a. To device
+            batch = to_device(batch, device=self.device)
+            segments, in_cache = self.vad_model.forward_online(**batch)
+            # in_cache.update(batch['in_cache'])
+            # in_cache = {key: value for key, value in batch['in_cache'].items()}
+        return fbanks, segments, in_cache
+
+
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
deleted file mode 100644
index 5fbd8449a..000000000
--- a/funasr/bin/vad_inference.py
+++ /dev/null
@@ -1,570 +0,0 @@
-import argparse
-import logging
-import os
-import sys
-import json
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-
-import math
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.modules.scorers.scorer_interface import BatchScorerInterface
-from funasr.modules.subsampling import TooShortUttError
-from funasr.tasks.vad import VADTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
-
-header_colors = '\033[95m'
-end_colors = '\033[0m'
-
-global_asr_language: str = 'zh-cn'
-global_sample_rate: Union[int, Dict[Any, int]] = {
- 'audio_fs': 16000,
- 'model_fs': 16000
-}
-
-
-class Speech2VadSegment:
- """Speech2VadSegment class
-
- Examples:
- >>> import soundfile
- >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2segment(audio)
- [[10, 230], [245, 450], ...]
-
- """
-
- def __init__(
- self,
- vad_infer_config: Union[Path, str] = None,
- vad_model_file: Union[Path, str] = None,
- vad_cmvn_file: Union[Path, str] = None,
- device: str = "cpu",
- batch_size: int = 1,
- dtype: str = "float32",
- **kwargs,
- ):
- assert check_argument_types()
-
- # 1. Build vad model
- vad_model, vad_infer_args = VADTask.build_model_from_file(
- vad_infer_config, vad_model_file, device
- )
- frontend = None
- if vad_infer_args.frontend is not None:
- frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf)
-
- logging.info("vad_model: {}".format(vad_model))
- logging.info("vad_infer_args: {}".format(vad_infer_args))
- vad_model.to(dtype=getattr(torch, dtype)).eval()
-
- self.vad_model = vad_model
- self.vad_infer_args = vad_infer_args
- self.device = device
- self.dtype = dtype
- self.frontend = frontend
- self.batch_size = batch_size
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
- in_cache: Dict[str, torch.Tensor] = dict()
- ) -> Tuple[List[List[int]], Dict[str, torch.Tensor]]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
-
- if self.frontend is not None:
- self.frontend.filter_length_max = math.inf
- fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
- feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len)
- fbanks = to_device(fbanks, device=self.device)
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- else:
- raise Exception("Need to extract feats first, please configure frontend configuration")
-
- # b. Forward Encoder streaming
- t_offset = 0
- step = min(feats_len.max(), 6000)
- segments = [[]] * self.batch_size
- for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
- if t_offset + step >= feats_len - 1:
- step = feats_len - t_offset
- is_final = True
- else:
- is_final = False
- batch = {
- "feats": feats[:, t_offset:t_offset + step, :],
- "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)],
- "is_final": is_final,
- "in_cache": in_cache
- }
- # a. To device
- #batch = to_device(batch, device=self.device)
- segments_part, in_cache = self.vad_model(**batch)
- if segments_part:
- for batch_num in range(0, self.batch_size):
- segments[batch_num] += segments_part[batch_num]
- return fbanks, segments
-
-class Speech2VadSegmentOnline(Speech2VadSegment):
- """Speech2VadSegmentOnline class
-
- Examples:
- >>> import soundfile
- >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2segment(audio)
- [[10, 230], [245, 450], ...]
-
- """
- def __init__(self, **kwargs):
- super(Speech2VadSegmentOnline, self).__init__(**kwargs)
- vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
- self.frontend = None
- if self.vad_infer_args.frontend is not None:
- self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
-
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
- in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
- ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- batch_size = speech.shape[0]
- segments = [[]] * batch_size
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
- fbanks, _ = self.frontend.get_fbank()
- else:
- raise Exception("Need to extract feats first, please configure frontend configuration")
- if feats.shape[0]:
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- waveforms = self.frontend.get_waveforms()
-
- batch = {
- "feats": feats,
- "waveform": waveforms,
- "in_cache": in_cache,
- "is_final": is_final,
- "max_end_sil": max_end_sil
- }
- # a. To device
- batch = to_device(batch, device=self.device)
- segments, in_cache = self.vad_model.forward_online(**batch)
- # in_cache.update(batch['in_cache'])
- # in_cache = {key: value for key, value in batch['in_cache'].items()}
- return fbanks, segments, in_cache
-
-
-def inference(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- vad_infer_config: Optional[str],
- vad_model_file: Optional[str],
- vad_cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- online: bool = False,
- **kwargs,
-):
- if not online:
- inference_pipeline = inference_modelscope(
- batch_size=batch_size,
- ngpu=ngpu,
- log_level=log_level,
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- key_file=key_file,
- allow_variable_data_keys=allow_variable_data_keys,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- num_workers=num_workers,
- **kwargs,
- )
- else:
- inference_pipeline = inference_modelscope_online(
- batch_size=batch_size,
- ngpu=ngpu,
- log_level=log_level,
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- key_file=key_file,
- allow_variable_data_keys=allow_variable_data_keys,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- num_workers=num_workers,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-def inference_modelscope(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- vad_infer_config: Optional[str],
- vad_model_file: Optional[str],
- vad_cmvn_file: Optional[str] = None,
- # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- **kwargs,
-):
- assert check_argument_types()
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
-
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- batch_size = 1
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2vadsegment
- speech2vadsegment_kwargs = dict(
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- device=device,
- dtype=dtype,
- )
- logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
- speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = VADTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
- collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- ibest_writer = writer[f"1best_recog"]
- else:
- writer = None
- ibest_writer = None
-
- vad_results = []
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
-
- # do vad segment
- _, results = speech2vadsegment(**batch)
- for i, _ in enumerate(keys):
- if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
- results[i] = json.dumps(results[i])
- item = {'key': keys[i], 'value': results[i]}
- vad_results.append(item)
- if writer is not None:
- ibest_writer["text"][keys[i]] = "{}".format(results[i])
-
- return vad_results
-
- return _forward
-
-def inference_modelscope_online(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- vad_infer_config: Optional[str],
- vad_model_file: Optional[str],
- vad_cmvn_file: Optional[str] = None,
- # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- **kwargs,
-):
- assert check_argument_types()
-
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- batch_size = 1
-
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2vadsegment
- speech2vadsegment_kwargs = dict(
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- device=device,
- dtype=dtype,
- )
- logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
- speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = VADTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
- collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- ibest_writer = writer[f"1best_recog"]
- else:
- writer = None
- ibest_writer = None
-
- vad_results = []
- batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
- is_final = param_dict.get('is_final', False) if param_dict is not None else False
- max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch['in_cache'] = batch_in_cache
- batch['is_final'] = is_final
- batch['max_end_sil'] = max_end_sil
-
- # do vad segment
- _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
- # param_dict['in_cache'] = batch['in_cache']
- if results:
- for i, _ in enumerate(keys):
- if results[i]:
- if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
- results[i] = json.dumps(results[i])
- item = {'key': keys[i], 'value': results[i]}
- vad_results.append(item)
- if writer is not None:
- ibest_writer["text"][keys[i]] = "{}".format(results[i])
-
- return vad_results
-
- return _forward
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="VAD Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--vad_infer_config",
- type=str,
- help="VAD infer configuration",
- )
- group.add_argument(
- "--vad_model_file",
- type=str,
- help="VAD model parameter file",
- )
- group.add_argument(
- "--vad_cmvn_file",
- type=str,
- help="Global cmvn file",
- )
- group.add_argument(
- "--online",
- type=str,
- help="decoding mode",
- )
-
- group = parser.add_argument_group("infer related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
-
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index de589259f..2ccc71691 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -17,6 +17,255 @@ from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
+import argparse
+import logging
+import os
+import sys
+import json
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import math
+import numpy as np
+import torch
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.scorers.scorer_interface import BatchScorerInterface
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.vad import VADTask
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
+from funasr.bin.vad_infer import Speech2VadSegment, Speech2VadSegmentOnline
+
+def inference_vad(
+    batch_size: int,
+    ngpu: int,
+    log_level: Union[int, str],
+    vad_infer_config: Optional[str],
+    vad_model_file: Optional[str],
+    vad_cmvn_file: Optional[str] = None,
+    key_file: Optional[str] = None,
+    allow_variable_data_keys: bool = False,
+    output_dir: Optional[str] = None,
+    dtype: str = "float32",
+    seed: int = 0,
+    num_workers: int = 1,
+    **kwargs,
+):
+    """Build an offline VAD pipeline and return its ``_forward`` callable.
+
+    The VAD model is built once here; each ``_forward`` call returns a list of
+    ``{'key': ..., 'value': ...}`` dicts, one per utterance key.
+    Raises ``NotImplementedError`` if ``batch_size`` is greater than 1.
+    """
+    assert check_argument_types()
+    if batch_size > 1:
+        raise NotImplementedError("batch decoding is not implemented")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    if ngpu >= 1 and torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+        batch_size = 1
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+
+    # 2. Build speech2vadsegment
+    speech2vadsegment_kwargs = dict(
+        vad_infer_config=vad_infer_config,
+        vad_model_file=vad_model_file,
+        vad_cmvn_file=vad_cmvn_file,
+        device=device,
+        dtype=dtype,
+    )
+    logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
+    speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
+
+    def _forward(
+        data_path_and_name_and_type,
+        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+        output_dir_v2: Optional[str] = None,
+        fs: dict = None,
+        param_dict: dict = None
+    ):
+        """Run VAD on the inputs; ``fs`` and ``param_dict`` are not used here."""
+        # 3. Build data-iterator
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, torch.Tensor):
+                # Tensor.numpy() needs a CPU tensor that does not require grad
+                raw_inputs = raw_inputs.detach().cpu().numpy()
+            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        loader = VADTask.build_streaming_iterator(
+            data_path_and_name_and_type,
+            dtype=dtype,
+            batch_size=batch_size,
+            key_file=key_file,
+            num_workers=num_workers,
+            preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
+            collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
+            allow_variable_data_keys=allow_variable_data_keys,
+            inference=True,
+        )
+
+        # 4. Open the optional writer; output_dir_v2 takes precedence over output_dir
+        # FIXME(kamo): The output format should be discussed about
+        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        writer = DatadirWriter(output_path) if output_path is not None else None
+        ibest_writer = writer["1best_recog"] if writer is not None else None
+
+        # Segments are JSON-encoded only when MODELSCOPE_ENVIRONMENT is "eas"
+        as_json = os.environ.get("MODELSCOPE_ENVIRONMENT") == "eas"
+        vad_results = []
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+
+            # 5. Do VAD segmentation and collect one item per utterance key
+            _, results = speech2vadsegment(**batch)
+            for i, key in enumerate(keys):
+                if as_json:
+                    results[i] = json.dumps(results[i])
+                vad_results.append({'key': key, 'value': results[i]})
+                if writer is not None:
+                    ibest_writer["text"][key] = "{}".format(results[i])
+
+        return vad_results
+
+    return _forward
+
+def inference_vad_online(
+    batch_size: int,
+    ngpu: int,
+    log_level: Union[int, str],
+    vad_infer_config: Optional[str],
+    vad_model_file: Optional[str],
+    vad_cmvn_file: Optional[str] = None,
+    key_file: Optional[str] = None,
+    allow_variable_data_keys: bool = False,
+    output_dir: Optional[str] = None,
+    dtype: str = "float32",
+    seed: int = 0,
+    num_workers: int = 1,
+    **kwargs,
+):
+    """Build a streaming VAD pipeline and return its ``_forward`` callable.
+
+    The VAD model is built once here; each ``_forward`` call returns the non-empty
+    results as ``{'key': ..., 'value': ...}`` dicts and updates ``param_dict['in_cache']``.
+    """
+    assert check_argument_types()
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    if ngpu >= 1 and torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+        batch_size = 1
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+
+    # 2. Build speech2vadsegment
+    speech2vadsegment_kwargs = dict(
+        vad_infer_config=vad_infer_config,
+        vad_model_file=vad_model_file,
+        vad_cmvn_file=vad_cmvn_file,
+        device=device,
+        dtype=dtype,
+    )
+    logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
+    speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
+
+    def _forward(
+        data_path_and_name_and_type,
+        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+        output_dir_v2: Optional[str] = None,
+        fs: dict = None,
+        param_dict: dict = None,
+    ):
+        """Run one VAD call; reads and updates streaming state in ``param_dict``."""
+        # The cache is written back into param_dict below, so None becomes {}
+        if param_dict is None:
+            param_dict = {}
+        # 3. Build data-iterator
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, torch.Tensor):
+                # Tensor.numpy() needs a CPU tensor that does not require grad
+                raw_inputs = raw_inputs.detach().cpu().numpy()
+            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        loader = VADTask.build_streaming_iterator(
+            data_path_and_name_and_type,
+            dtype=dtype,
+            batch_size=batch_size,
+            key_file=key_file,
+            num_workers=num_workers,
+            preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
+            collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
+            allow_variable_data_keys=allow_variable_data_keys,
+            inference=True,
+        )
+
+        # FIXME(kamo): The output format should be discussed about
+        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        writer = DatadirWriter(output_path) if output_path is not None else None
+        ibest_writer = writer["1best_recog"] if writer is not None else None
+
+        batch_in_cache = param_dict.get('in_cache', dict())
+        is_final = param_dict.get('is_final', False)
+        max_end_sil = param_dict.get('max_end_sil', 800)
+        # Segments are JSON-encoded only when MODELSCOPE_ENVIRONMENT is "eas"
+        as_json = os.environ.get("MODELSCOPE_ENVIRONMENT") == "eas"
+        vad_results = []
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+            batch['in_cache'] = batch_in_cache
+            batch['is_final'] = is_final
+            batch['max_end_sil'] = max_end_sil
+
+            # 4. Do VAD segmentation and store the returned cache in param_dict
+            _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
+            if not results:
+                continue
+            for i, key in enumerate(keys):
+                if not results[i]:
+                    continue
+                if as_json:
+                    results[i] = json.dumps(results[i])
+                vad_results.append({'key': key, 'value': results[i]})
+                if writer is not None:
+                    ibest_writer["text"][key] = "{}".format(results[i])
+
+        return vad_results
+
+    return _forward
+
def get_parser():
parser = config_argparse.ArgumentParser(
@@ -111,11 +360,9 @@ def get_parser():
def inference_launch(mode, **kwargs):
if mode == "offline":
- from funasr.bin.vad_inference import inference_modelscope
- return inference_modelscope(**kwargs)
+ return inference_vad(**kwargs)
elif mode == "online":
- from funasr.bin.vad_inference import inference_modelscope_online
- return inference_modelscope_online(**kwargs)
+ return inference_vad_online(**kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
@@ -147,8 +394,8 @@ def main(cmd=None):
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = gpuid
- inference_launch(**kwargs)
-
+ inference_pipeline = inference_launch(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"])
if __name__ == "__main__":
main()
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
deleted file mode 100644
index a3633093e..000000000
--- a/funasr/bin/vad_inference_online.py
+++ /dev/null
@@ -1,344 +0,0 @@
-import argparse
-import logging
-import os
-import sys
-import json
-from pathlib import Path
-from typing import Any
-from typing import List
-from typing import Optional
-from typing import Sequence
-from typing import Tuple
-from typing import Union
-from typing import Dict
-
-import numpy as np
-import torch
-from typeguard import check_argument_types
-from typeguard import check_return_type
-
-from funasr.fileio.datadir_writer import DatadirWriter
-from funasr.tasks.vad import VADTask
-from funasr.torch_utils.device_funcs import to_device
-from funasr.torch_utils.set_all_random_seed import set_all_random_seed
-from funasr.utils import config_argparse
-from funasr.utils.cli_utils import get_commandline_args
-from funasr.utils.types import str2bool
-from funasr.utils.types import str2triple_str
-from funasr.utils.types import str_or_none
-from funasr.models.frontend.wav_frontend import WavFrontendOnline
-from funasr.models.frontend.wav_frontend import WavFrontend
-from funasr.bin.vad_inference import Speech2VadSegment
-
-header_colors = '\033[95m'
-end_colors = '\033[0m'
-
-
-class Speech2VadSegmentOnline(Speech2VadSegment):
- """Speech2VadSegmentOnline class
-
- Examples:
- >>> import soundfile
- >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
- >>> audio, rate = soundfile.read("speech.wav")
- >>> speech2segment(audio)
- [[10, 230], [245, 450], ...]
-
- """
- def __init__(self, **kwargs):
- super(Speech2VadSegmentOnline, self).__init__(**kwargs)
- vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
- self.frontend = None
- if self.vad_infer_args.frontend is not None:
- self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
-
-
- @torch.no_grad()
- def __call__(
- self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
- in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
- ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
- """Inference
-
- Args:
- speech: Input speech data
- Returns:
- text, token, token_int, hyp
-
- """
- assert check_argument_types()
-
- # Input as audio signal
- if isinstance(speech, np.ndarray):
- speech = torch.tensor(speech)
- batch_size = speech.shape[0]
- segments = [[]] * batch_size
- if self.frontend is not None:
- feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
- fbanks, _ = self.frontend.get_fbank()
- else:
- raise Exception("Need to extract feats first, please configure frontend configuration")
- if feats.shape[0]:
- feats = to_device(feats, device=self.device)
- feats_len = feats_len.int()
- waveforms = self.frontend.get_waveforms()
-
- batch = {
- "feats": feats,
- "waveform": waveforms,
- "in_cache": in_cache,
- "is_final": is_final,
- "max_end_sil": max_end_sil
- }
- # a. To device
- batch = to_device(batch, device=self.device)
- segments, in_cache = self.vad_model.forward_online(**batch)
- # in_cache.update(batch['in_cache'])
- # in_cache = {key: value for key, value in batch['in_cache'].items()}
- return fbanks, segments, in_cache
-
-
-def inference(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- data_path_and_name_and_type,
- vad_infer_config: Optional[str],
- vad_model_file: Optional[str],
- vad_cmvn_file: Optional[str] = None,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- **kwargs,
-):
- inference_pipeline = inference_modelscope(
- batch_size=batch_size,
- ngpu=ngpu,
- log_level=log_level,
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- key_file=key_file,
- allow_variable_data_keys=allow_variable_data_keys,
- output_dir=output_dir,
- dtype=dtype,
- seed=seed,
- num_workers=num_workers,
- **kwargs,
- )
- return inference_pipeline(data_path_and_name_and_type, raw_inputs)
-
-
-def inference_modelscope(
- batch_size: int,
- ngpu: int,
- log_level: Union[int, str],
- # data_path_and_name_and_type,
- vad_infer_config: Optional[str],
- vad_model_file: Optional[str],
- vad_cmvn_file: Optional[str] = None,
- # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- key_file: Optional[str] = None,
- allow_variable_data_keys: bool = False,
- output_dir: Optional[str] = None,
- dtype: str = "float32",
- seed: int = 0,
- num_workers: int = 1,
- **kwargs,
-):
- assert check_argument_types()
- ncpu = kwargs.get("ncpu", 1)
- torch.set_num_threads(ncpu)
-
- if batch_size > 1:
- raise NotImplementedError("batch decoding is not implemented")
-
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
-
- if ngpu >= 1 and torch.cuda.is_available():
- device = "cuda"
- else:
- device = "cpu"
- batch_size = 1
- # 1. Set random-seed
- set_all_random_seed(seed)
-
- # 2. Build speech2vadsegment
- speech2vadsegment_kwargs = dict(
- vad_infer_config=vad_infer_config,
- vad_model_file=vad_model_file,
- vad_cmvn_file=vad_cmvn_file,
- device=device,
- dtype=dtype,
- )
- logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
- speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
-
- def _forward(
- data_path_and_name_and_type,
- raw_inputs: Union[np.ndarray, torch.Tensor] = None,
- output_dir_v2: Optional[str] = None,
- fs: dict = None,
- param_dict: dict = None,
- ):
- # 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, torch.Tensor):
- raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
- loader = VADTask.build_streaming_iterator(
- data_path_and_name_and_type,
- dtype=dtype,
- batch_size=batch_size,
- key_file=key_file,
- num_workers=num_workers,
- preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
- collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
- allow_variable_data_keys=allow_variable_data_keys,
- inference=True,
- )
-
- finish_count = 0
- file_count = 1
- # 7 .Start for-loop
- # FIXME(kamo): The output format should be discussed about
- output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
- if output_path is not None:
- writer = DatadirWriter(output_path)
- ibest_writer = writer[f"1best_recog"]
- else:
- writer = None
- ibest_writer = None
-
- vad_results = []
- batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
- is_final = param_dict.get('is_final', False) if param_dict is not None else False
- max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800
- for keys, batch in loader:
- assert isinstance(batch, dict), type(batch)
- assert all(isinstance(s, str) for s in keys), keys
- _bs = len(next(iter(batch.values())))
- assert len(keys) == _bs, f"{len(keys)} != {_bs}"
- batch['in_cache'] = batch_in_cache
- batch['is_final'] = is_final
- batch['max_end_sil'] = max_end_sil
-
- # do vad segment
- _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
- # param_dict['in_cache'] = batch['in_cache']
- if results:
- for i, _ in enumerate(keys):
- if results[i]:
- if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
- results[i] = json.dumps(results[i])
- item = {'key': keys[i], 'value': results[i]}
- vad_results.append(item)
- if writer is not None:
- ibest_writer["text"][keys[i]] = "{}".format(results[i])
-
- return vad_results
-
- return _forward
-
-
-def get_parser():
- parser = config_argparse.ArgumentParser(
- description="VAD Decoding",
- formatter_class=argparse.ArgumentDefaultsHelpFormatter,
- )
-
- # Note(kamo): Use '_' instead of '-' as separator.
- # '-' is confusing if written in yaml.
- parser.add_argument(
- "--log_level",
- type=lambda x: x.upper(),
- default="INFO",
- choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
- help="The verbose level of logging",
- )
-
- parser.add_argument("--output_dir", type=str, required=False)
- parser.add_argument(
- "--ngpu",
- type=int,
- default=0,
- help="The number of gpus. 0 indicates CPU mode",
- )
- parser.add_argument(
- "--gpuid_list",
- type=str,
- default="",
- help="The visible gpus",
- )
- parser.add_argument("--seed", type=int, default=0, help="Random seed")
- parser.add_argument(
- "--dtype",
- default="float32",
- choices=["float16", "float32", "float64"],
- help="Data type",
- )
- parser.add_argument(
- "--num_workers",
- type=int,
- default=1,
- help="The number of workers used for DataLoader",
- )
-
- group = parser.add_argument_group("Input data related")
- group.add_argument(
- "--data_path_and_name_and_type",
- type=str2triple_str,
- required=False,
- action="append",
- )
- group.add_argument("--raw_inputs", type=list, default=None)
- # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
- group.add_argument("--key_file", type=str_or_none)
- group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
-
- group = parser.add_argument_group("The model configuration related")
- group.add_argument(
- "--vad_infer_config",
- type=str,
- help="VAD infer configuration",
- )
- group.add_argument(
- "--vad_model_file",
- type=str,
- help="VAD model parameter file",
- )
- group.add_argument(
- "--vad_cmvn_file",
- type=str,
- help="Global cmvn file",
- )
-
- group = parser.add_argument_group("infer related")
- group.add_argument(
- "--batch_size",
- type=int,
- default=1,
- help="The batch size for inference",
- )
-
- return parser
-
-
-def main(cmd=None):
- print(get_commandline_args(), file=sys.stderr)
- parser = get_parser()
- args = parser.parse_args(cmd)
- kwargs = vars(args)
- kwargs.pop("config", None)
- inference(**kwargs)
-
-
-if __name__ == "__main__":
- main()
diff --git a/funasr/version.txt b/funasr/version.txt
index 4b9fcbec1..cb0c939a9 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.5.1
+0.5.2