diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py index c4bb61bd1..ee361351a 100644 --- a/funasr/bin/asr_inference_paraformer_vad_punc.py +++ b/funasr/bin/asr_inference_paraformer_vad_punc.py @@ -144,7 +144,7 @@ class Speech2Text: for scorer in scorers.values(): if isinstance(scorer, torch.nn.Module): scorer.to(device=device, dtype=getattr(torch, dtype)).eval() - + logging.info(f"Decoding device={device}, dtype={dtype}") # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text @@ -184,12 +184,11 @@ class Speech2Text: self.encoder_downsampling_factor = 1 if asr_train_args.encoder_conf["input_layer"] == "conv2d": self.encoder_downsampling_factor = 4 - - @torch.no_grad() def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, begin_time: int = 0, end_time: int = None, + self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None, + begin_time: int = 0, end_time: int = None, ): """Inference @@ -215,7 +214,7 @@ class Speech2Text: else: feats = speech feats_len = speech_lengths - lfr_factor = max(1, (feats.size()[-1]//80)-1) + lfr_factor = max(1, (feats.size()[-1] // 80) - 1) batch = {"speech": feats, "speech_lengths": feats_len} # a. To device @@ -229,7 +228,8 @@ class Speech2Text: enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor predictor_outs = self.asr_model.calc_predictor(enc, enc_len) - pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3] + pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ + predictor_outs[2], predictor_outs[3] pre_token_length = pre_token_length.round().long() if torch.max(pre_token_length) < 1: return [] @@ -249,7 +249,7 @@ class Speech2Text: nbest_hyps = self.beam_search( x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio ) - + nbest_hyps = nbest_hyps[: self.nbest] else: yseq = am_scores.argmax(dim=-1) @@ -260,23 +260,23 @@ class Speech2Text: [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device ) nbest_hyps = [Hypothesis(yseq=yseq, score=score)] - + for hyp in nbest_hyps: assert isinstance(hyp, (Hypothesis)), type(hyp) - + # remove sos/eos and get results last_pos = -1 if isinstance(hyp.yseq, list): token_int = hyp.yseq[1:last_pos] else: token_int = hyp.yseq[1:last_pos].tolist() - + # remove blank symbol id, which is assumed to be 0 token_int = list(filter(lambda x: x != 0 and x != 2, token_int)) - + # Change integer-ids to tokens token = self.converter.ids2tokens(token_int) - + if self.tokenizer is not None: text = self.tokenizer.tokens2text(token) else: @@ -286,12 +286,14 @@ class Speech2Text: timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time) results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor)) else: - time_stamp = time_stamp_lfr6(alphas[i:i + 1, ], enc_len[i:i + 1, ], copy.copy(token), begin_time, end_time) + time_stamp = time_stamp_lfr6(alphas[i:i + 1, ], enc_len[i:i + 1, ], copy.copy(token), begin_time, + end_time) results.append((text, token, token_int, time_stamp, enc_len_batch_total, lfr_factor)) # assert check_return_type(results) return results + class Speech2VadSegment: """Speech2VadSegment class @@ -333,6 +335,7 @@ class Speech2VadSegment: self.device = device self.dtype = dtype self.frontend = frontend + self.batch_size = batch_size @torch.no_grad() def __call__( @@ -361,56 +364,69 @@ class Speech2VadSegment: feats_len = feats_len.int() else: raise Exception("Need to extract feats first, please configure frontend configuration") - batch = {"feats": feats, "feats_lengths": feats_len, "waveform": speech} - # a. To device - batch = to_device(batch, device=self.device) - - # b. Forward Encoder - segments = self.vad_model(**batch) + # b. Forward Encoder streaming + t_offset = 0 + step = min(feats_len, 6000) + segments = [[]] * self.batch_size + for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): + if t_offset + step >= feats_len - 1: + step = feats_len - t_offset + is_final_send = True + else: + is_final_send = False + batch = { + "feats": feats[:, t_offset:t_offset + step, :], + "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], + "is_final_send": is_final_send + } + # a. To device + batch = to_device(batch, device=self.device) + segments_part = self.vad_model(**batch) + if segments_part: + for batch_num in range(0, self.batch_size): + segments[batch_num] += segments_part[batch_num] return fbanks, segments - def inference( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - streaming: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - vad_infer_config: Optional[str] = None, - vad_model_file: Optional[str] = None, - vad_cmvn_file: Optional[str] = None, - time_stamp_writer: bool = False, - punc_infer_config: Optional[str] = None, - punc_model_file: Optional[str] = None, - **kwargs, + maxlenratio: float, + minlenratio: float, + batch_size: int, + beam_size: int, + ngpu: int, + ctc_weight: float, + lm_weight: float, + penalty: float, + log_level: Union[int, str], + data_path_and_name_and_type, + asr_train_config: Optional[str], + asr_model_file: Optional[str], + cmvn_file: Optional[str] = None, + raw_inputs: Union[np.ndarray, torch.Tensor] = None, + lm_train_config: Optional[str] = None, + lm_file: Optional[str] = None, + token_type: Optional[str] = None, + key_file: Optional[str] = None, + word_lm_train_config: Optional[str] = None, + bpemodel: Optional[str] = None, + allow_variable_data_keys: bool = False, + streaming: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + ngram_weight: float = 0.9, + nbest: int = 1, + num_workers: int = 1, + vad_infer_config: Optional[str] = None, + vad_model_file: Optional[str] = None, + vad_cmvn_file: Optional[str] = None, + time_stamp_writer: bool = False, + punc_infer_config: Optional[str] = None, + punc_model_file: Optional[str] = None, + **kwargs, ): - inference_pipeline = inference_modelscope( maxlenratio=maxlenratio, minlenratio=minlenratio, @@ -449,63 +465,64 @@ def inference( ) return inference_pipeline(data_path_and_name_and_type, raw_inputs) + def inference_modelscope( - maxlenratio: float, - minlenratio: float, - batch_size: int, - beam_size: int, - ngpu: int, - ctc_weight: float, - lm_weight: float, - penalty: float, - log_level: Union[int, str], - # data_path_and_name_and_type, - asr_train_config: Optional[str], - asr_model_file: Optional[str], - cmvn_file: Optional[str] = None, - lm_train_config: Optional[str] = None, - lm_file: Optional[str] = None, - token_type: Optional[str] = None, - key_file: Optional[str] = None, - word_lm_train_config: Optional[str] = None, - bpemodel: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - ngram_weight: float = 0.9, - nbest: int = 1, - num_workers: int = 1, - vad_infer_config: Optional[str] = None, - vad_model_file: Optional[str] = None, - vad_cmvn_file: Optional[str] = None, - time_stamp_writer: bool = True, - punc_infer_config: Optional[str] = None, - punc_model_file: Optional[str] = None, - outputs_dict: Optional[bool] = True, - param_dict: dict = None, - **kwargs, + maxlenratio: float, + minlenratio: float, + batch_size: int, + beam_size: int, + ngpu: int, + ctc_weight: float, + lm_weight: float, + penalty: float, + log_level: Union[int, str], + # data_path_and_name_and_type, + asr_train_config: Optional[str], + asr_model_file: Optional[str], + cmvn_file: Optional[str] = None, + lm_train_config: Optional[str] = None, + lm_file: Optional[str] = None, + token_type: Optional[str] = None, + key_file: Optional[str] = None, + word_lm_train_config: Optional[str] = None, + bpemodel: Optional[str] = None, + allow_variable_data_keys: bool = False, + output_dir: Optional[str] = None, + dtype: str = "float32", + seed: int = 0, + ngram_weight: float = 0.9, + nbest: int = 1, + num_workers: int = 1, + vad_infer_config: Optional[str] = None, + vad_model_file: Optional[str] = None, + vad_cmvn_file: Optional[str] = None, + time_stamp_writer: bool = True, + punc_infer_config: Optional[str] = None, + punc_model_file: Optional[str] = None, + outputs_dict: Optional[bool] = True, + param_dict: dict = None, + **kwargs, ): assert check_argument_types() - + if word_lm_train_config is not None: raise NotImplementedError("Word LM is not implemented") if ngpu > 1: raise NotImplementedError("only single GPU decoding is supported") - + logging.basicConfig( level=log_level, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) - + if ngpu >= 1 and torch.cuda.is_available(): device = "cuda" else: device = "cpu" - + # 1. Set random-seed set_all_random_seed(seed) - + # 2. Build speech2vadsegment speech2vadsegment_kwargs = dict( vad_infer_config=vad_infer_config, @@ -516,7 +533,7 @@ def inference_modelscope( ) # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - + # 3. Build speech2text speech2text_kwargs = dict( asr_train_config=asr_train_config, @@ -539,14 +556,14 @@ def inference_modelscope( ) speech2text = Speech2Text(**speech2text_kwargs) text2punc = None - if punc_model_file is not None: + if punc_model_file is not None: text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype) if output_dir is not None: writer = DatadirWriter(output_dir) ibest_writer = writer[f"1best_recog"] ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) - + def _forward(data_path_and_name_and_type, raw_inputs: Union[np.ndarray, torch.Tensor] = None, output_dir_v2: Optional[str] = None, @@ -575,7 +592,7 @@ def inference_modelscope( use_timestamp = param_dict.get('use_timestamp', True) else: use_timestamp = True - + finish_count = 0 file_count = 1 lfr_factor = 6 @@ -586,13 +603,13 @@ def inference_modelscope( if output_path is not None: writer = DatadirWriter(output_path) ibest_writer = writer[f"1best_recog"] - + for keys, batch in loader: assert isinstance(batch, dict), type(batch) assert all(isinstance(s, str) for s in keys), keys _bs = len(next(iter(batch.values()))) assert len(keys) == _bs, f"{len(keys)} != {_bs}" - + vad_results = speech2vadsegment(**batch) fbanks, vadsegments = vad_results[0], vad_results[1] for i, segments in enumerate(vadsegments): @@ -606,19 +623,20 @@ def inference_modelscope( results = speech2text(**batch) if len(results) < 1: continue - + result_cur = [results[0][:-2]] if j == 0: result_segments = result_cur else: - result_segments = [[result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]] - + result_segments = [ + [result_segments[0][i] + result_cur[0][i] for i in range(len(result_cur[0]))]] + key = keys[0] result = result_segments[0] text, token, token_int = result[0], result[1], result[2] time_stamp = None if len(result) < 4 else result[3] - - if use_timestamp and time_stamp is not None: + + if use_timestamp and time_stamp is not None: postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) else: postprocessed_result = postprocess_utils.sentence_postprocess(token) @@ -635,13 +653,13 @@ def inference_modelscope( text_postprocessed_punc = text_postprocessed if len(word_lists) > 0 and text2punc is not None: text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20) - + item = {'key': key, 'value': text_postprocessed_punc} if text_postprocessed != "": item['text_postprocessed'] = text_postprocessed if time_stamp_postprocessed != "": item['time_stamp'] = time_stamp_postprocessed - + asr_result_list.append(item) finish_count += 1 # asr_utils.print_progress(finish_count / file_count) @@ -654,11 +672,13 @@ def inference_modelscope( ibest_writer["text_with_punc"][key] = text_postprocessed_punc if time_stamp_postprocessed is not None: ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) - + logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc)) return asr_result_list + return _forward + def get_parser(): parser = config_argparse.ArgumentParser( description="ASR Decoding", diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py index b0f8a77b3..607f131dd 100644 --- a/funasr/bin/vad_inference.py +++ b/funasr/bin/vad_inference.py @@ -107,10 +107,8 @@ class Speech2VadSegment: feats_len = feats_len.int() else: raise Exception("Need to extract feats first, please configure frontend configuration") - # batch = {"feats": feats, "waveform": speech, "is_final_send": True} - # segments = self.vad_model(**batch) - # b. Forward Encoder sreaming + # b. Forward Encoder streaming t_offset = 0 step = min(feats_len, 6000) segments = [[]] * self.batch_size diff --git a/vad_inference.py b/vad_inference.py deleted file mode 100644 index 607f131dd..000000000 --- a/vad_inference.py +++ /dev/null @@ -1,364 +0,0 @@ -import argparse -import logging -import sys -import json -from pathlib import Path -from typing import Any -from typing import List -from typing import Optional -from typing import Sequence -from typing import Tuple -from typing import Union -from typing import Dict - -import numpy as np -import torch -from typeguard import check_argument_types -from typeguard import check_return_type - -from funasr.fileio.datadir_writer import DatadirWriter -from funasr.modules.scorers.scorer_interface import BatchScorerInterface -from funasr.modules.subsampling import TooShortUttError -from funasr.tasks.vad import VADTask -from funasr.torch_utils.device_funcs import to_device -from funasr.torch_utils.set_all_random_seed import set_all_random_seed -from funasr.utils import config_argparse -from funasr.utils.cli_utils import get_commandline_args -from funasr.utils.types import str2bool -from funasr.utils.types import str2triple_str -from funasr.utils.types import str_or_none -from funasr.utils import asr_utils, wav_utils, postprocess_utils -from funasr.models.frontend.wav_frontend import WavFrontend - -header_colors = '\033[95m' -end_colors = '\033[0m' - -global_asr_language: str = 'zh-cn' -global_sample_rate: Union[int, Dict[Any, int]] = { - 'audio_fs': 16000, - 'model_fs': 16000 -} - - -class Speech2VadSegment: - """Speech2VadSegment class - - Examples: - >>> import soundfile - >>> speech2segment = Speech2VadSegment("vad_config.yml", "vad.pt") - >>> audio, rate = soundfile.read("speech.wav") - >>> speech2segment(audio) - [[10, 230], [245, 450], ...] - - """ - - def __init__( - self, - vad_infer_config: Union[Path, str] = None, - vad_model_file: Union[Path, str] = None, - vad_cmvn_file: Union[Path, str] = None, - device: str = "cpu", - batch_size: int = 1, - dtype: str = "float32", - **kwargs, - ): - assert check_argument_types() - - # 1. Build vad model - vad_model, vad_infer_args = VADTask.build_model_from_file( - vad_infer_config, vad_model_file, device - ) - frontend = None - if vad_infer_args.frontend is not None: - frontend = WavFrontend(cmvn_file=vad_cmvn_file, **vad_infer_args.frontend_conf) - - logging.info("vad_model: {}".format(vad_model)) - logging.info("vad_infer_args: {}".format(vad_infer_args)) - vad_model.to(dtype=getattr(torch, dtype)).eval() - - self.vad_model = vad_model - self.vad_infer_args = vad_infer_args - self.device = device - self.dtype = dtype - self.frontend = frontend - self.batch_size = batch_size - - @torch.no_grad() - def __call__( - self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None - ) -> List[List[int]]: - """Inference - - Args: - speech: Input speech data - Returns: - text, token, token_int, hyp - - """ - assert check_argument_types() - - # Input as audio signal - if isinstance(speech, np.ndarray): - speech = torch.tensor(speech) - - if self.frontend is not None: - feats, feats_len = self.frontend.forward(speech, speech_lengths) - feats = to_device(feats, device=self.device) - feats_len = feats_len.int() - else: - raise Exception("Need to extract feats first, please configure frontend configuration") - - # b. Forward Encoder streaming - t_offset = 0 - step = min(feats_len, 6000) - segments = [[]] * self.batch_size - for t_offset in range(0, feats_len, min(step, feats_len - t_offset)): - if t_offset + step >= feats_len - 1: - step = feats_len - t_offset - is_final_send = True - else: - is_final_send = False - batch = { - "feats": feats[:, t_offset:t_offset + step, :], - "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)], - "is_final_send": is_final_send - } - # a. To device - batch = to_device(batch, device=self.device) - segments_part = self.vad_model(**batch) - if segments_part: - for batch_num in range(0, self.batch_size): - segments[batch_num] += segments_part[batch_num] - return segments - - -def inference( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - inference_pipeline = inference_modelscope( - batch_size=batch_size, - ngpu=ngpu, - log_level=log_level, - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - key_file=key_file, - allow_variable_data_keys=allow_variable_data_keys, - output_dir=output_dir, - dtype=dtype, - seed=seed, - num_workers=num_workers, - **kwargs, - ) - return inference_pipeline(data_path_and_name_and_type, raw_inputs) - - -def inference_modelscope( - batch_size: int, - ngpu: int, - log_level: Union[int, str], - # data_path_and_name_and_type, - vad_infer_config: Optional[str], - vad_model_file: Optional[str], - vad_cmvn_file: Optional[str] = None, - # raw_inputs: Union[np.ndarray, torch.Tensor] = None, - key_file: Optional[str] = None, - allow_variable_data_keys: bool = False, - output_dir: Optional[str] = None, - dtype: str = "float32", - seed: int = 0, - num_workers: int = 1, - **kwargs, -): - assert check_argument_types() - if batch_size > 1: - raise NotImplementedError("batch decoding is not implemented") - if ngpu > 1: - raise NotImplementedError("only single GPU decoding is supported") - - logging.basicConfig( - level=log_level, - format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", - ) - - if ngpu >= 1 and torch.cuda.is_available(): - device = "cuda" - else: - device = "cpu" - - # 1. Set random-seed - set_all_random_seed(seed) - - # 2. Build speech2vadsegment - speech2vadsegment_kwargs = dict( - vad_infer_config=vad_infer_config, - vad_model_file=vad_model_file, - vad_cmvn_file=vad_cmvn_file, - device=device, - dtype=dtype, - ) - logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) - speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) - - def _forward( - data_path_and_name_and_type, - raw_inputs: Union[np.ndarray, torch.Tensor] = None, - output_dir_v2: Optional[str] = None, - fs: dict = None, - param_dict: dict = None, - ): - # 3. Build data-iterator - loader = VADTask.build_streaming_iterator( - data_path_and_name_and_type, - dtype=dtype, - batch_size=batch_size, - key_file=key_file, - num_workers=num_workers, - preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), - collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), - allow_variable_data_keys=allow_variable_data_keys, - inference=True, - ) - - finish_count = 0 - file_count = 1 - # 7 .Start for-loop - # FIXME(kamo): The output format should be discussed about - output_path = output_dir_v2 if output_dir_v2 is not None else output_dir - if output_path is not None: - writer = DatadirWriter(output_path) - ibest_writer = writer[f"1best_recog"] - else: - writer = None - ibest_writer = None - - vad_results = [] - for keys, batch in loader: - assert isinstance(batch, dict), type(batch) - assert all(isinstance(s, str) for s in keys), keys - _bs = len(next(iter(batch.values()))) - assert len(keys) == _bs, f"{len(keys)} != {_bs}" - - # do vad segment - results = speech2vadsegment(**batch) - for i, _ in enumerate(keys): - results[i] = json.dumps(results[i]) - item = {'key': keys[i], 'value': results[i]} - vad_results.append(item) - if writer is not None: - results[i] = json.loads(results[i]) - ibest_writer["text"][keys[i]] = "{}".format(results[i]) - - return vad_results - - return _forward - - -def get_parser(): - parser = config_argparse.ArgumentParser( - description="VAD Decoding", - formatter_class=argparse.ArgumentDefaultsHelpFormatter, - ) - - # Note(kamo): Use '_' instead of '-' as separator. - # '-' is confusing if written in yaml. - parser.add_argument( - "--log_level", - type=lambda x: x.upper(), - default="INFO", - choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"), - help="The verbose level of logging", - ) - - parser.add_argument("--output_dir", type=str, required=False) - parser.add_argument( - "--ngpu", - type=int, - default=0, - help="The number of gpus. 0 indicates CPU mode", - ) - parser.add_argument( - "--gpuid_list", - type=str, - default="", - help="The visible gpus", - ) - parser.add_argument("--seed", type=int, default=0, help="Random seed") - parser.add_argument( - "--dtype", - default="float32", - choices=["float16", "float32", "float64"], - help="Data type", - ) - parser.add_argument( - "--num_workers", - type=int, - default=1, - help="The number of workers used for DataLoader", - ) - - group = parser.add_argument_group("Input data related") - group.add_argument( - "--data_path_and_name_and_type", - type=str2triple_str, - required=False, - action="append", - ) - group.add_argument("--raw_inputs", type=list, default=None) - # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}]) - group.add_argument("--key_file", type=str_or_none) - group.add_argument("--allow_variable_data_keys", type=str2bool, default=False) - - group = parser.add_argument_group("The model configuration related") - group.add_argument( - "--vad_infer_config", - type=str, - help="VAD infer configuration", - ) - group.add_argument( - "--vad_model_file", - type=str, - help="VAD model parameter file", - ) - group.add_argument( - "--vad_cmvn_file", - type=str, - help="Global cmvn file", - ) - - group = parser.add_argument_group("infer related") - group.add_argument( - "--batch_size", - type=int, - default=1, - help="The batch size for inference", - ) - - return parser - - -def main(cmd=None): - print(get_commandline_args(), file=sys.stderr) - parser = get_parser() - args = parser.parse_args(cmd) - kwargs = vars(args) - kwargs.pop("config", None) - inference(**kwargs) - - -if __name__ == "__main__": - main()