From 41b1d35048603ba0a74e9dee5be1a1c8c76444d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E5=8C=80?= Date: Thu, 16 Feb 2023 17:02:56 +0800 Subject: [PATCH 1/5] update vad_inference.py --- funasr/bin/vad_inference.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py index 0d9659401..1cdb582e5 100644 --- a/funasr/bin/vad_inference.py +++ b/funasr/bin/vad_inference.py @@ -111,6 +111,7 @@ class Speech2VadSegment: # b. Forward Encoder sreaming segments = [] + segments_tmp = [] step = 6000 t_offset = 0 for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): @@ -128,9 +129,8 @@ class Speech2VadSegment: batch = to_device(batch, device=self.device) segments_part = self.vad_model(**batch) if segments_part: - segments += segments_part - #print(segments) - + segments_tmp += segments_part[0] + segments.append(segments_tmp) return segments From 91027ddab49e5791fc42569b4db9dafca55735e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E5=8C=80?= Date: Thu, 16 Feb 2023 22:11:18 +0800 Subject: [PATCH 2/5] fix vad results bug --- .../README.md | 24 +++++++++++++++++++ .../speech_fsmn_vad_zh-cn-16k-common/infer.py | 15 ++++++++++++ .../speech_fsmn_vad_zh-cn-8k-common/README.md | 24 +++++++++++++++++++ .../speech_fsmn_vad_zh-cn-8k-common/infer.py | 15 ++++++++++++ funasr/bin/vad_inference.py | 11 ++++----- funasr/models/e2e_vad.py | 24 ++++--------------- funasr/tasks/vad.py | 3 +-- 7 files changed, 88 insertions(+), 28 deletions(-) create mode 100644 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md create mode 100644 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py create mode 100644 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md create mode 100644 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md new file mode 100644 index 000000000..6d9cd3024 --- /dev/null +++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md @@ -0,0 +1,24 @@ +# ModelScope Model + +## How to finetune and infer using a pretrained ModelScope Model + +### Inference + +Or you can use the finetuned model for inference directly. + +- Setting parameters in `infer.py` + - audio_in: # support wav, url, bytes, and parsed audio format. + - output_dir: # If the input format is wav.scp, it needs to be set. + +- Then you can run the pipeline to infer with: +```python + python infer.py +``` + + +Modify inference related parameters in vad.yaml. 
+ +- max_end_silence_time: The end-point silence duration to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms +- speech_noise_thres: The balance of speech and silence scores, the parameter range is (-1,1) + - The value tends to -1, the greater probability of noise being judged as speech + - The value tends to 1, the greater probability of speech being judged as noise diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py new file mode 100644 index 000000000..c255474b8 --- /dev/null +++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py @@ -0,0 +1,15 @@ +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +if __name__ == '__main__': + audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav' + output_dir = None + inference_pipline = pipeline( + task=Tasks.voice_activity_detection, + model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", + model_revision=None, + output_dir=output_dir, + batch_size=1, + ) + segments_result = inference_pipline(audio_in=audio_in) + print(segments_result) diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md new file mode 100644 index 000000000..6d9cd3024 --- /dev/null +++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md @@ -0,0 +1,24 @@ +# ModelScope Model + +## How to finetune and infer using a pretrained ModelScope Model + +### Inference + +Or you can use the finetuned model for inference directly. + +- Setting parameters in `infer.py` + - audio_in: # support wav, url, bytes, and parsed audio format. + - output_dir: # If the input format is wav.scp, it needs to be set. + +- Then you can run the pipeline to infer with: +```python + python infer.py +``` + + +Modify inference related parameters in vad.yaml. 
+ +- max_end_silence_time: The end-point silence duration to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms +- speech_noise_thres: The balance of speech and silence scores, the parameter range is (-1,1) + - The value tends to -1, the greater probability of noise being judged as speech + - The value tends to 1, the greater probability of speech being judged as noise diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py new file mode 100644 index 000000000..6061413e5 --- /dev/null +++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py @@ -0,0 +1,15 @@ +from modelscope.pipelines import pipeline +from modelscope.utils.constant import Tasks + +if __name__ == '__main__': + audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav' + output_dir = None + inference_pipline = pipeline( + task=Tasks.voice_activity_detection, + model="damo/speech_fsmn_vad_zh-cn-8k-common", + model_revision='v1.1.1', + output_dir='./output_dir', + batch_size=1, + ) + segments_result = inference_pipline(audio_in=audio_in) + print(segments_result) diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py index 1cdb582e5..b0f8a77b3 100644 --- a/funasr/bin/vad_inference.py +++ b/funasr/bin/vad_inference.py @@ -81,6 +81,7 @@ class Speech2VadSegment: self.device = device self.dtype = dtype self.frontend = frontend + self.batch_size = batch_size @torch.no_grad() def __call__( @@ -110,10 +111,9 @@ class Speech2VadSegment: # segments = self.vad_model(**batch) # b. Forward Encoder sreaming - segments = [] - segments_tmp = [] - step = 6000 t_offset = 0 + step = min(feats_len, 6000) + segments = [[]] * self.batch_size for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): if t_offset + step >= feats_len - 1: step = feats_len - t_offset @@ -129,8 +129,8 @@ class Speech2VadSegment: batch = to_device(batch, device=self.device) segments_part = self.vad_model(**batch) if segments_part: - segments_tmp += segments_part[0] - segments.append(segments_tmp) + for batch_num in range(0, self.batch_size): + segments[batch_num] += segments_part[batch_num] return segments @@ -254,7 +254,6 @@ def inference_modelscope( assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" - # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")} # do vad segment results = speech2vadsegment(**batch) diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py index 8afc8db6d..b64c677f3 100755 --- a/funasr/models/e2e_vad.py +++ b/funasr/models/e2e_vad.py @@ -192,7 +192,7 @@ class WindowDetector(object): class E2EVadModel(nn.Module): - def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], streaming=False): + def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any]): super(E2EVadModel, self).__init__() self.vad_opts = VADXOptions(**vad_post_args) self.windows_detector = WindowDetector(self.vad_opts.window_size_ms, @@ -227,7 +227,6 @@ class E2EVadModel(nn.Module): self.data_buf = None self.data_buf_all = None self.waveform = None - self.streaming = streaming self.ResetDetection() def AllResetDetection(self): @@ -451,11 +450,7 @@ class E2EVadModel(nn.Module): if not is_final_send: self.DetectCommonFrames() else: - if self.streaming: - self.DetectLastFrames() - else: - self.AllResetDetection() - self.DetectAllFrames() # offline decode and 
is_final_send == True + self.DetectLastFrames() segments = [] for batch_num in range(0, feats.shape[0]): # only support batch_size = 1 now segment_batch = [] @@ -468,7 +463,8 @@ class E2EVadModel(nn.Module): self.output_data_buf_offset += 1 # need update this parameter if segment_batch: segments.append(segment_batch) - + if is_final_send: + self.AllResetDetection() return segments def DetectCommonFrames(self) -> int: @@ -494,18 +490,6 @@ class E2EVadModel(nn.Module): return 0 - def DetectAllFrames(self) -> int: - if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected: - return 0 - if self.vad_opts.nn_eval_block_size != self.vad_opts.dcd_block_size: - frame_state = FrameState.kFrameStateInvalid - for t in range(0, self.frm_cnt): - frame_state = self.GetFrameState(t) - self.DetectOneFrame(frame_state, t, t == self.frm_cnt - 1) - else: - pass - return 0 - def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None: tmp_cur_frm_state = FrameState.kFrameStateInvalid if cur_frm_state == FrameState.kFrameStateSpeech: diff --git a/funasr/tasks/vad.py b/funasr/tasks/vad.py index e2a912394..22a5cb3d3 100644 --- a/funasr/tasks/vad.py +++ b/funasr/tasks/vad.py @@ -291,8 +291,7 @@ class VADTask(AbsTask): model_class = model_choices.get_class(args.model) except AttributeError: model_class = model_choices.get_class("e2evad") - model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf, - streaming=args.encoder_conf.get('streaming', False)) + model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf) return model From ff8fdd4acf0f7968992af26b9b7f3f2ae0de825b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E5=8C=80?= Date: Thu, 16 Feb 2023 22:12:38 +0800 Subject: [PATCH 3/5] delete speech_fsmn_vad_zh-cn-16k-common-pytorch --- .../README.md | 24 ------------------- .../infer.py | 15 ------------ 2 files changed, 39 deletions(-) delete mode 100644 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md delete mode 100755 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md deleted file mode 100644 index 6d9cd3024..000000000 --- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/README.md +++ /dev/null @@ -1,24 +0,0 @@ -# ModelScope Model - -## How to finetune and infer using a pretrained ModelScope Model - -### Inference - -Or you can use the finetuned model for inference directly. - -- Setting parameters in `infer.py` - - audio_in: # support wav, url, bytes, and parsed audio format. - - output_dir: # If the input format is wav.scp, it needs to be set. - -- Then you can run the pipeline to infer with: -```python - python infer.py -``` - - -Modify inference related parameters in vad.yaml. 
- -- max_end_silence_time: The end-point silence duration to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms -- speech_noise_thres: The balance of speech and silence scores, the parameter range is (-1,1) - - The value tends to -1, the greater probability of noise being judged as speech - - The value tends to 1, the greater probability of speech being judged as noise diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py deleted file mode 100755 index e11d5d21f..000000000 --- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common-pytorch/infer.py +++ /dev/null @@ -1,15 +0,0 @@ -from modelscope.pipelines import pipeline -from modelscope.utils.constant import Tasks - -if __name__ == '__main__': - audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav' - output_dir = None - inference_pipline = pipeline( - task=Tasks.auto_speech_recognition, - model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch", - model_revision=None, - output_dir=output_dir, - batch_size=1, - ) - segments_result = inference_pipline(audio_in=audio_in) - print(segments_result) From ebbde50a98a4a3009df839485e58fd0ddbd4befd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E5=8C=80?= Date: Thu, 16 Feb 2023 23:00:14 +0800 Subject: [PATCH 4/5] support asr_inference_paraformer_vad_punc --- vad_inference.py | 364 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 364 insertions(+) create mode 100644 vad_inference.py diff --git a/vad_inference.py b/vad_inference.py new file mode 100644 index 000000000..607f131dd --- /dev/null +++ b/vad_inference.py @@ -0,0 +1,364 @@ +import argparse +import logging +import sys +import json +from pathlib import Path +from typing import Any +from typing import List +from typing import Optional +from typing import Sequence +from typing import Tuple +from typing import Union +from typing import Dict + +import numpy as np +import torch +from typeguard import check_argument_types +from typeguard import check_return_type + +from funasr.fileio.datadir_writer import DatadirWriter +from funasr.modules.scorers.scorer_interface import BatchScorerInterface +from funasr.modules.subsampling import TooShortUttError +from funasr.tasks.vad import VADTask +from funasr.torch_utils.device_funcs import to_device +from funasr.torch_utils.set_all_random_seed import set_all_random_seed +from funasr.utils import config_argparse +from funasr.utils.cli_utils import get_commandline_args +from funasr.utils.types import str2bool +from funasr.utils.types import str2triple_str +from funasr.utils.types import str_or_none +from funasr.utils import asr_utils, wav_utils, postprocess_utils +from funasr.models.frontend.wav_frontend import WavFrontend + +header_colors = '\033[95m' +end_colors = '\033[0m' + +global_asr_language: str = 'zh-cn' +global_sample_rate: Union[int, Dict[Any, int]] = { + 'audio_fs': 16000, + 'model_fs': 16000 +} + + +class Speech2VadSegment: + """Speech2VadSegment class + + Examples: + >>> import soundfile + >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") + >>> audio, rate = soundfile.read("speech.wav") + >>> speech2segment(audio) + [[10, 230], [245, 450], ...] 
+ + """ + + def __init__( + self, + vad_infer_config: Union[Path, str] = None, + vad_model_file: Union[Path, str] = None, + vad_cmvn_file: Union[Path, str] = None, + device: str = "cpu", + batch_size: int = 1, + dtype: str = "float32", + **kwargs, + ): + assert check_argument_types() + + # 1. Build vad model + vad_model, vad_infer_args = VADTask.build_model_from_file( + vad_infer_config, vad_model_file, device + ) + frontend = None + if vad_infer_args.frontend is not None: + frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf) + + logging.info("vad_model: {}".format(vad_model)) + logging.info("vad_infer_args: {}".format(vad_infer_args)) + vad_model.to(dtype=getattr(torch, dtype)).eval() + + self.vad_model = vad_model + self.vad_infer_args = vad_infer_args + self.device = device + self.dtype = dtype + self.frontend = frontend + self.batch_size = batch_size + + @torch.no_grad() + def __call__( + self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None + ) -> List[List[int]]: + """Inference + + Args: + speech: Input speech data + Returns: + text, token, token_int, hyp + + """ + assert check_argument_types() + + # Input as audio signal + if isinstance(speech, np.ndarray): + speech = torch.tensor(speech) + + if self.frontend is not None: + feats, feats_len = self.frontend.forward(speech, speech_lengths) + feats = to_device(feats, device=self.device) + feats_len = feats_len.int() + else: + raise Exception("Need to extract feats first, please configure frontend configuration") + + # b. Forward Encoder streaming + t_offset = 0 + step = min(feats_len, 6000) + segments = [[]] * self.batch_size + for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): + if t_offset + step >= feats_len - 1: + step = feats_len - t_offset + is_final_send = True + else: + is_final_send = False + batch = { + "feats": feats[:, t_offset:t_offset + step, :], + "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], + "is_final_send": is_final_send + } + # a. 
To device + batch = to_device(batch, device=self.device) + segments_part = self.vad_model(**batch) + if segments_part: + for batch_num in range(0, self.batch_size): + segments[batch_num] += segments_part[batch_num] + return segments + + +def inference( + batch_size: int, + ngpu: int, + log_level: Union[int, str], + data_path_and_name_and_type, + vad_infer_config: Optional[str], + vad_model_file: Optional[str], + vad_cmvn_file: Optional[str] = None, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + key_file: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + num_workers: int = 1, + **kwargs, +): + inference_pipeline = inference_modelscope( + batch_size=batch_size, + ngpu=ngpu, + log_level=log_level, + vad_infer_config=vad_infer_config, + vad_model_file=vad_model_file, + vad_cmvn_file=vad_cmvn_file, + key_file=key_file, + allow_variable_data_keys=allow_variable_data_keys, + output_dir=output_dir, + dtype=dtype, + seed=seed, + num_workers=num_workers, + **kwargs, + ) + return inference_pipeline(data_path_and_name_and_type, raw_inputs) + + +def inference_modelscope( + batch_size: int, + ngpu: int, + log_level: Union[int, str], + # data_path_and_name_and_type, + vad_infer_config: Optional[str], + vad_model_file: Optional[str], + vad_cmvn_file: Optional[str] = None, + # raw_inputs: Union[np.ndarray, torch.Tensor] = None, + key_file: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + num_workers: int = 1, + **kwargs, +): + assert check_argument_types() + if batch_size > 1: + raise NotImplementedError("batch decoding is not implemented") + if ngpu > 1: + raise NotImplementedError("only single GPU decoding is supported") + + logging.basicConfig( + level=log_level, + format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", + ) + + if ngpu >= 1 and torch.cuda.is_available(): + device = "cuda" + else: + device = "cpu" + + # 1. Set random-seed + set_all_random_seed(seed) + + # 2. Build speech2vadsegment + speech2vadsegment_kwargs = dict( + vad_infer_config=vad_infer_config, + vad_model_file=vad_model_file, + vad_cmvn_file=vad_cmvn_file, + device=device, + dtype=dtype, + ) + logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) + speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) + + def _forward( + data_path_and_name_and_type, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + output_dir_v2: Optional[str] = None, + fs: dict = None, + param_dict: dict = None, + ): + # 3. 
Build data-iterator + loader = VADTask.build_streaming_iterator( + data_path_and_name_and_type, + dtype=dtype, + batch_size=batch_size, + key_file=key_file, + num_workers=num_workers, + preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), + collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), + allow_variable_data_keys=allow_variable_data_keys, + inference=True, + ) + + finish_count = 0 + file_count = 1 + # 7 .Start for-loop + # FIXME(kamo): The output format should be discussed about + output_path = output_dir_v2 if output_dir_v2 is not None else output_dir + if output_path is not None: + writer = DatadirWriter(output_path) + ibest_writer = writer[f"1best_recog"] + else: + writer = None + ibest_writer = None + + vad_results = [] + for keys, batch in loader: + assert isinstance(batch, dict), type(batch) + assert all(isinstance(s, str) for s in keys), keys + _bs = len(next(iter(batch.values()))) + assert len(keys) == _bs, f"{len(keys)} != {_bs}" + + # do vad segment + results = speech2vadsegment(**batch) + for i, _ in enumerate(keys): + results[i] = json.dumps(results[i]) + item = {'key': keys[i], 'value': results[i]} + vad_results.append(item) + if writer is not None: + results[i] = json.loads(results[i]) + ibest_writer["text"][keys[i]] = "{}".format(results[i]) + + return vad_results + + return _forward + + +def get_parser(): + parser = config_argparse.ArgumentParser( + description="VAD Decoding", + formatter_class=argparse.ArgumentDefaultsHelpFormatter, + ) + + # Note(kamo): Use '_' instead of '-' as separator. + # '-' is confusing if written in yaml. + parser.add_argument( + "--log_level", + type=lambda x: x.upper(), + default="INFO", + choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), + help="The verbose level of logging", + ) + + parser.add_argument("--output_dir", type=str, required=False) + parser.add_argument( + "--ngpu", + type=int, + default=0, + help="The number of gpus. 
0 indicates CPU mode", + ) + parser.add_argument( + "--gpuid_list", + type=str, + default="", + help="The visible gpus", + ) + parser.add_argument("--seed", type=int, default=0, help="Random seed") + parser.add_argument( + "--dtype", + default="float32", + choices=["float16", "float32", "float64"], + help="Data type", + ) + parser.add_argument( + "--num_workers", + type=int, + default=1, + help="The number of workers used for DataLoader", + ) + + group = parser.add_argument_group("Input data related") + group.add_argument( + "--data_path_and_name_and_type", + type=str2triple_str, + required=False, + action="append", + ) + group.add_argument("--raw_inputs", type=list, default=None) + # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) + group.add_argument("--key_file", type=str_or_none) + group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) + + group = parser.add_argument_group("The model configuration related") + group.add_argument( + "--vad_infer_config", + type=str, + help="VAD infer configuration", + ) + group.add_argument( + "--vad_model_file", + type=str, + help="VAD model parameter file", + ) + group.add_argument( + "--vad_cmvn_file", + type=str, + help="Global cmvn file", + ) + + group = parser.add_argument_group("infer related") + group.add_argument( + "--batch_size", + type=int, + default=1, + help="The batch size for inference", + ) + + return parser + + +def main(cmd=None): + print(get_commandline_args(), file=sys.stderr) + parser = get_parser() + args = parser.parse_args(cmd) + kwargs = vars(args) + kwargs.pop("config", None) + inference(**kwargs) + + +if __name__ == "__main__": + main() From 8689fb676d6cd28894f55c8cac43409e7bf4cd38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E5=8C=80?= Date: Thu, 16 Feb 2023 23:08:56 +0800 Subject: [PATCH 5/5] support asr_inference_paraformer_vad_punc --- .../bin/asr_inference_paraformer_vad_punc.py | 240 ++++++------ funasr/bin/vad_inference.py | 4 +- vad_inference.py | 364 ------------------ 3 files changed, 131 insertions(+), 477 deletions(-) delete mode 100644 vad_inference.py diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py index c4bb61bd1..ee361351a 100644 --- a/funasr/bin/asr_inference_paraformer_vad_punc.py +++ b/funasr/bin/asr_inference_paraformer_vad_punc.py @@ -144,7 +144,7 @@ class Speech2Text: for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() - + logging.info(f"Decoding device={device}, dtype={dtype}") # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text @@ -184,12 +184,11 @@ class Speech2Text: self.encoder_downsampling_factor = 1 if asr_train_args.encoder_conf["input_layer"] == "conv2d": self.encoder_downsampling_factor = 4 - - @torch.no_grad() def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, begin_time: int = 0, end_time: int = None, + self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, + begin_time: int = 0, end_time: int = None, ): """Inference @@ -215,7 +214,7 @@ class Speech2Text: else: feats = speech feats_len = speech_lengths - lfr_factor = max(1, (feats.size()[-1]//80)-1) + lfr_factor = max(1, (feats.size()[-1] // 80) - 1) batch = {"speech": feats, "speech_lengths": feats_len} # a. 
To device @@ -229,7 +228,8 @@ class Speech2Text: enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor predictor_outs = self.asr_model.calc_predictor(enc, enc_len) - pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3] + pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ + predictor_outs[2], predictor_outs[3] pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: return [] @@ -249,7 +249,7 @@ class Speech2Text: nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio ) - + nbest_hyps = nbest_hyps[: self.nbest] else: yseq = am_scores.argmax(dim=-1) @@ -260,23 +260,23 @@ class Speech2Text: [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device ) nbest_hyps = [Hypothesis(yseq=yseq, score=score)] - + for hyp in nbest_hyps: assert isinstance(hyp, (Hypothesis)), type(hyp) - + # remove sos/eos and get results last_pos = -1 if isinstance(hyp.yseq, list): token_int = hyp.yseq[1:last_pos] else: token_int = hyp.yseq[1:last_pos].tolist() - + # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != 0 and x != 2, token_int)) - + # Change integer-ids to tokens token = self.converter.ids2tokens(token_int) - + if self.tokenizer is not None: text = self.tokenizer.tokens2text(token) else: @@ -286,12 +286,14 @@ class Speech2Text: timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time) results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor)) else: - time_stamp = time_stamp_lfr6(alphas[i:i + 1, ], enc_len[i:i + 1, ], copy.copy(token), begin_time, end_time) + time_stamp = time_stamp_lfr6(alphas[i:i + 1, ], enc_len[i:i + 1, ], copy.copy(token), begin_time, + end_time) results.append((text, token, token_int, time_stamp, enc_len_batch_total, lfr_factor)) # assert check_return_type(results) return results + class Speech2VadSegment: """Speech2VadSegment class @@ -333,6 +335,7 @@ class Speech2VadSegment: self.device = device self.dtype = dtype self.frontend = frontend + self.batch_size = batch_size @torch.no_grad() def __call__( @@ -361,56 +364,69 @@ class Speech2VadSegment: feats_len = feats_len.int() else: raise Exception("Need to extract feats first, please configure frontend configuration") - batch = {"feats": feats, "feats_lengths": feats_len, "waveform": speech} - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - segments = self.vad_model(**batch) + # b. Forward Encoder streaming + t_offset = 0 + step = min(feats_len, 6000) + segments = [[]] * self.batch_size + for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): + if t_offset + step >= feats_len - 1: + step = feats_len - t_offset + is_final_send = True + else: + is_final_send = False + batch = { + "feats": feats[:, t_offset:t_offset + step, :], + "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], + "is_final_send": is_final_send + } + # a. 
To device + batch = to_device(batch, device=self.device) + segments_part = self.vad_model(**batch) + if segments_part: + for batch_num in range(0, self.batch_size): + segments[batch_num] += segments_part[batch_num] return fbanks, segments - def inference( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - vad_infer_config: Optional[str] = None, - vad_model_file: Optional[str] = None, - vad_cmvn_file: Optional[str] = None, - time_stamp_writer: bool = False, - punc_infer_config: Optional[str] = None, - punc_model_file: Optional[str] = None, - **kwargs, + maxlenratio: float, + minlenratio: float, + batch_size: int, + beam_size: int, + ngpu: int, + ctc_weight: float, + lm_weight: float, + penalty: float, + log_level: Union[int, str], + data_path_and_name_and_type, + asr_train_config: Optional[str], + asr_model_file: Optional[str], + cmvn_file: Optional[str] = None, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + lm_train_config: Optional[str] = None, + lm_file: Optional[str] = None, + token_type: Optional[str] = None, + key_file: Optional[str] = None, + word_lm_train_config: Optional[str] = None, + bpemodel: Optional[str] = None, + allow_variable_data_keys: bool = False, + streaming: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + ngram_weight: float = 0.9, + nbest: int = 1, + num_workers: int = 1, + vad_infer_config: Optional[str] = None, + vad_model_file: Optional[str] = None, + vad_cmvn_file: Optional[str] = None, + time_stamp_writer: bool = False, + punc_infer_config: Optional[str] = None, + punc_model_file: Optional[str] = None, + **kwargs, ): - inference_pipeline = inference_modelscope( maxlenratio=maxlenratio, minlenratio=minlenratio, @@ -449,63 +465,64 @@ def inference( ) return inference_pipeline(data_path_and_name_and_type, raw_inputs) + def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - vad_infer_config: Optional[str] = None, - vad_model_file: Optional[str] = None, - vad_cmvn_file: Optional[str] = None, - time_stamp_writer: bool = True, - punc_infer_config: Optional[str] = None, - 
punc_model_file: Optional[str] = None, - outputs_dict: Optional[bool] = True, - param_dict: dict = None, - **kwargs, + maxlenratio: float, + minlenratio: float, + batch_size: int, + beam_size: int, + ngpu: int, + ctc_weight: float, + lm_weight: float, + penalty: float, + log_level: Union[int, str], + # data_path_and_name_and_type, + asr_train_config: Optional[str], + asr_model_file: Optional[str], + cmvn_file: Optional[str] = None, + lm_train_config: Optional[str] = None, + lm_file: Optional[str] = None, + token_type: Optional[str] = None, + key_file: Optional[str] = None, + word_lm_train_config: Optional[str] = None, + bpemodel: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + ngram_weight: float = 0.9, + nbest: int = 1, + num_workers: int = 1, + vad_infer_config: Optional[str] = None, + vad_model_file: Optional[str] = None, + vad_cmvn_file: Optional[str] = None, + time_stamp_writer: bool = True, + punc_infer_config: Optional[str] = None, + punc_model_file: Optional[str] = None, + outputs_dict: Optional[bool] = True, + param_dict: dict = None, + **kwargs, ): assert check_argument_types() - + if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") - + logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) - + if ngpu >= 1 and torch.cuda.is_available(): device = "cuda" else: device = "cpu" - + # 1. Set random-seed set_all_random_seed(seed) - + # 2. Build speech2vadsegment speech2vadsegment_kwargs = dict( vad_infer_config=vad_infer_config, @@ -516,7 +533,7 @@ def inference_modelscope( ) # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - + # 3. 
Build speech2text speech2text_kwargs = dict( asr_train_config=asr_train_config, @@ -539,14 +556,14 @@ def inference_modelscope( ) speech2text = Speech2Text(**speech2text_kwargs) text2punc = None - if punc_model_file is not None: + if punc_model_file is not None: text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype) if output_dir is not None: writer = DatadirWriter(output_dir) ibest_writer = writer[f"1best_recog"] ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) - + def _forward(data_path_and_name_and_type, raw_inputs: Union[np.ndarray, torch.Tensor] = None, output_dir_v2: Optional[str] = None, @@ -575,7 +592,7 @@ def inference_modelscope( use_timestamp = param_dict.get('use_timestamp', True) else: use_timestamp = True - + finish_count = 0 file_count = 1 lfr_factor = 6 @@ -586,13 +603,13 @@ def inference_modelscope( if output_path is not None: writer = DatadirWriter(output_path) ibest_writer = writer[f"1best_recog"] - + for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" - + vad_results = speech2vadsegment(**batch) fbanks, vadsegments = vad_results[0], vad_results[1] for i, segments in enumerate(vadsegments): @@ -606,19 +623,20 @@ def inference_modelscope( results = speech2text(**batch) if len(results) < 1: continue - + result_cur = [results[0][:-2]] if j == 0: result_segments = result_cur else: - result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]] - + result_segments = [ + [result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]] + key = keys[0] result = result_segments[0] text, token, token_int = result[0], result[1], result[2] time_stamp = None if len(result) < 4 else result[3] - - if use_timestamp and time_stamp is not None: + + if use_timestamp and time_stamp is not None: postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) else: postprocessed_result = postprocess_utils.sentence_postprocess(token) @@ -635,13 +653,13 @@ def inference_modelscope( text_postprocessed_punc = text_postprocessed if len(word_lists) > 0 and text2punc is not None: text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20) - + item = {'key': key, 'value': text_postprocessed_punc} if text_postprocessed != "": item['text_postprocessed'] = text_postprocessed if time_stamp_postprocessed != "": item['time_stamp'] = time_stamp_postprocessed - + asr_result_list.append(item) finish_count += 1 # asr_utils.print_progress(finish_count / file_count) @@ -654,11 +672,13 @@ def inference_modelscope( ibest_writer["text_with_punc"][key] = text_postprocessed_punc if time_stamp_postprocessed is not None: ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) - + logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc)) return asr_result_list + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( description="ASR Decoding", diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py index b0f8a77b3..607f131dd 100644 --- a/funasr/bin/vad_inference.py +++ b/funasr/bin/vad_inference.py @@ -107,10 +107,8 @@ class Speech2VadSegment: feats_len = feats_len.int() else: raise Exception("Need to extract feats first, please configure frontend configuration") - # batch = {"feats": feats, "waveform": speech, "is_final_send": True} - # segments = 
self.vad_model(**batch) - # b. Forward Encoder sreaming + # b. Forward Encoder streaming t_offset = 0 step = min(feats_len, 6000) segments = [[]] * self.batch_size diff --git a/vad_inference.py b/vad_inference.py deleted file mode 100644 index 607f131dd..000000000 --- a/vad_inference.py +++ /dev/null @@ -1,364 +0,0 @@ -import argparse -import logging -import sys -import json -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.scorers.scorer_interface import BatchScorerInterface -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.vad import VADTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend - -header_colors = '\033[95m' -end_colors = '\033[0m' - -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} - - -class Speech2VadSegment: - """Speech2VadSegment class - - Examples: - >>> import soundfile - >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2segment(audio) - [[10, 230], [245, 450], ...] - - """ - - def __init__( - self, - vad_infer_config: Union[Path, str] = None, - vad_model_file: Union[Path, str] = None, - vad_cmvn_file: Union[Path, str] = None, - device: str = "cpu", - batch_size: int = 1, - dtype: str = "float32", - **kwargs, - ): - assert check_argument_types() - - # 1. Build vad model - vad_model, vad_infer_args = VADTask.build_model_from_file( - vad_infer_config, vad_model_file, device - ) - frontend = None - if vad_infer_args.frontend is not None: - frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf) - - logging.info("vad_model: {}".format(vad_model)) - logging.info("vad_infer_args: {}".format(vad_infer_args)) - vad_model.to(dtype=getattr(torch, dtype)).eval() - - self.vad_model = vad_model - self.vad_infer_args = vad_infer_args - self.device = device - self.dtype = dtype - self.frontend = frontend - self.batch_size = batch_size - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None - ) -> List[List[int]]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - else: - raise Exception("Need to extract feats first, please configure frontend configuration") - - # b. 
Forward Encoder streaming - t_offset = 0 - step = min(feats_len, 6000) - segments = [[]] * self.batch_size - for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): - if t_offset + step >= feats_len - 1: - step = feats_len - t_offset - is_final_send = True - else: - is_final_send = False - batch = { - "feats": feats[:, t_offset:t_offset + step, :], - "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], - "is_final_send": is_final_send - } - # a. To device - batch = to_device(batch, device=self.device) - segments_part = self.vad_model(**batch) - if segments_part: - for batch_num in range(0, self.batch_size): - segments[batch_num] += segments_part[batch_num] - return segments - - -def inference( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - inference_pipeline = inference_modelscope( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - assert check_argument_types() - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - ): - # 3. 
Build data-iterator - loader = VADTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - else: - writer = None - ibest_writer = None - - vad_results = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - - # do vad segment - results = speech2vadsegment(**batch) - for i, _ in enumerate(keys): - results[i] = json.dumps(results[i]) - item = {'key': keys[i], 'value': results[i]} - vad_results.append(item) - if writer is not None: - results[i] = json.loads(results[i]) - ibest_writer["text"][keys[i]] = "{}".format(results[i]) - - return vad_results - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="VAD Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 
0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--vad_infer_config", - type=str, - help="VAD infer configuration", - ) - group.add_argument( - "--vad_model_file", - type=str, - help="VAD model parameter file", - ) - group.add_argument( - "--vad_cmvn_file", - type=str, - help="Global cmvn file", - ) - - group = parser.add_argument_group("infer related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main()
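
Note: the core change in this series is the chunked ("streaming") forward in `Speech2VadSegment.__call__`: features are fed to the VAD model in fixed 6000-frame steps, the last chunk carries an `is_final_send` flag, and the per-chunk segments are merged per utterance. Below is a minimal standalone sketch of that pattern, not FunASR code. The `fake_vad_model` stub, the tensor shapes, and the feature dimension are illustrative assumptions; only the step size, the final-chunk flag, and the waveform slicing math mirror the patch.

```python
import torch


def fake_vad_model(feats, waveform, is_final_send):
    # Stand-in for E2EVadModel.forward(): returns one [start_ms, end_ms]
    # segment per batch item so the accumulation logic can be exercised.
    batch = feats.shape[0]
    return [[[0, feats.shape[1] * 10]] for _ in range(batch)]


def chunked_vad_forward(feats, waveform, batch_size=1, max_step=6000):
    """Feed features to a VAD model in fixed-size chunks and merge the
    per-chunk segments, mirroring the streaming loop added by this patch."""
    feats_len = feats.shape[1]
    # One independent result list per batch item.
    segments = [[] for _ in range(batch_size)]
    t_offset = 0
    while t_offset < feats_len:
        step = min(max_step, feats_len - t_offset)
        is_final_send = t_offset + step >= feats_len
        chunk = {
            "feats": feats[:, t_offset:t_offset + step, :],
            # 10 ms frame shift at 16 kHz -> 160 samples per frame,
            # plus one 25 ms (400-sample) analysis window.
            "waveform": waveform[:, t_offset * 160:
                                 min(waveform.shape[-1],
                                     (t_offset + step - 1) * 160 + 400)],
            "is_final_send": is_final_send,
        }
        segments_part = fake_vad_model(**chunk)
        if segments_part:
            for b in range(batch_size):
                segments[b] += segments_part[b]
        t_offset += step
    return segments


if __name__ == "__main__":
    # Assumed shapes: (batch, frames, feat_dim) features and raw 16 kHz audio.
    feats = torch.randn(1, 13000, 400)
    waveform = torch.randn(1, 13000 * 160)
    print(chunked_vad_forward(feats, waveform))
```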