diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index aff0a443b..1e19f5fc0 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -30,7 +30,7 @@ from funasr.utils.types import str2bool
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
 
 header_colors = '\033[95m'
 end_colors = '\033[0m'
@@ -109,7 +109,7 @@ class Speech2VadSegment:
             fbanks, fbanks_len = self.frontend.forward_fbank(speech, speech_lengths)
             feats, feats_len = self.frontend.forward_lfr_cmvn(fbanks, fbanks_len)
             fbanks = to_device(fbanks, device=self.device)
-            feats = to_device(feats, device=self.device)
+            # feats = to_device(feats, device=self.device)
             feats_len = feats_len.int()
         else:
             raise Exception("Need to extract feats first, please configure frontend configuration")
@@ -138,6 +138,69 @@ class Speech2VadSegment:
                 segments[batch_num] += segments_part[batch_num]
         return fbanks, segments
 
 
+class Speech2VadSegmentOnline(Speech2VadSegment):
+    """Speech2VadSegmentOnline class
+
+    Examples:
+        >>> import soundfile
+        >>> speech2segment = Speech2VadSegmentOnline("vad_config.yml", "vad.pt")
+        >>> audio, rate = soundfile.read("speech.wav")
+        >>> speech2segment(audio)
+        [[10, 230], [245, 450], ...]
+
+    """
+    def __init__(self, **kwargs):
+        super(Speech2VadSegmentOnline, self).__init__(**kwargs)
+        vad_cmvn_file = kwargs.get('vad_cmvn_file', None)
+        self.frontend = None
+        if self.vad_infer_args.frontend is not None:
+            self.frontend = WavFrontendOnline(cmvn_file=vad_cmvn_file, **self.vad_infer_args.frontend_conf)
+
+    @torch.no_grad()
+    def __call__(
+            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+            in_cache: Dict[str, torch.Tensor] = dict(), is_final: bool = False, max_end_sil: int = 800
+    ) -> Tuple[torch.Tensor, List[List[int]], torch.Tensor]:
+        """Inference
+
+        Args:
+            speech: Input speech data
+        Returns:
+            fbanks, segments, in_cache
+
+        """
+        assert check_argument_types()
+
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+        batch_size = speech.shape[0]
+        segments = [[]] * batch_size
+        if self.frontend is not None:
+            feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
+            fbanks, _ = self.frontend.get_fbank()
+        else:
+            raise Exception("Need to extract feats first, please configure frontend configuration")
+        if feats.shape[0]:
+            feats = to_device(feats, device=self.device)
+            feats_len = feats_len.int()
+            waveforms = self.frontend.get_waveforms()
+
+            batch = {
+                "feats": feats,
+                "waveform": waveforms,
+                "in_cache": in_cache,
+                "is_final": is_final,
+                "max_end_sil": max_end_sil
+            }
+            # a. To device
+            batch = to_device(batch, device=self.device)
+            segments, in_cache = self.vad_model.forward_online(**batch)
+            # in_cache.update(batch['in_cache'])
+            # in_cache = {key: value for key, value in batch['in_cache'].items()}
+        return fbanks, segments, in_cache
+
 
 def inference(
     batch_size: int,
@@ -154,26 +217,43 @@ def inference(
     dtype: str = "float32",
     seed: int = 0,
     num_workers: int = 1,
+    online: bool = False,
     **kwargs,
 ):
-    inference_pipeline = inference_modelscope(
-        batch_size=batch_size,
-        ngpu=ngpu,
-        log_level=log_level,
-        vad_infer_config=vad_infer_config,
-        vad_model_file=vad_model_file,
-        vad_cmvn_file=vad_cmvn_file,
-        key_file=key_file,
-        allow_variable_data_keys=allow_variable_data_keys,
-        output_dir=output_dir,
-        dtype=dtype,
-        seed=seed,
-        num_workers=num_workers,
-        **kwargs,
-    )
+    if not online:
+        inference_pipeline = inference_modelscope(
+            batch_size=batch_size,
+            ngpu=ngpu,
+            log_level=log_level,
+            vad_infer_config=vad_infer_config,
+            vad_model_file=vad_model_file,
+            vad_cmvn_file=vad_cmvn_file,
+            key_file=key_file,
+            allow_variable_data_keys=allow_variable_data_keys,
+            output_dir=output_dir,
+            dtype=dtype,
+            seed=seed,
+            num_workers=num_workers,
+            **kwargs,
+        )
+    else:
+        inference_pipeline = inference_modelscope_online(
+            batch_size=batch_size,
+            ngpu=ngpu,
+            log_level=log_level,
+            vad_infer_config=vad_infer_config,
+            vad_model_file=vad_model_file,
+            vad_cmvn_file=vad_cmvn_file,
+            key_file=key_file,
+            allow_variable_data_keys=allow_variable_data_keys,
+            output_dir=output_dir,
+            dtype=dtype,
+            seed=seed,
+            num_workers=num_workers,
+            **kwargs,
+        )
     return inference_pipeline(data_path_and_name_and_type, raw_inputs)
 
-
 def inference_modelscope(
     batch_size: int,
     ngpu: int,
@@ -192,9 +272,6 @@ def inference_modelscope(
     **kwargs,
 ):
     assert check_argument_types()
-    ncpu = kwargs.get("ncpu", 1)
-    torch.set_num_threads(ncpu)
-
     if batch_size > 1:
         raise NotImplementedError("batch decoding is not implemented")
     if ngpu > 1:
@@ -282,6 +359,119 @@ def inference_modelscope(
 
     return _forward
 
+
+def inference_modelscope_online(
+    batch_size: int,
+    ngpu: int,
+    log_level: Union[int, str],
+    # data_path_and_name_and_type,
+    vad_infer_config: Optional[str],
+    vad_model_file: Optional[str],
+    vad_cmvn_file: Optional[str] = None,
+    # raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+    key_file: Optional[str] = None,
+    allow_variable_data_keys: bool = False,
+    output_dir: Optional[str] = None,
+    dtype: str = "float32",
+    seed: int = 0,
+    num_workers: int = 1,
+    **kwargs,
+):
+    assert check_argument_types()
+    if batch_size > 1:
+        raise NotImplementedError("batch decoding is not implemented")
+    if ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    if ngpu >= 1 and torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+
+    # 2. Build speech2vadsegment
+    speech2vadsegment_kwargs = dict(
+        vad_infer_config=vad_infer_config,
+        vad_model_file=vad_model_file,
+        vad_cmvn_file=vad_cmvn_file,
+        device=device,
+        dtype=dtype,
+    )
+    logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs))
+    speech2vadsegment = Speech2VadSegmentOnline(**speech2vadsegment_kwargs)
+
+    def _forward(
+            data_path_and_name_and_type,
+            raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+            output_dir_v2: Optional[str] = None,
+            fs: dict = None,
+            param_dict: dict = None,
+    ):
+        # 3. Build data-iterator
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, torch.Tensor):
+                raw_inputs = raw_inputs.numpy()
+            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        loader = VADTask.build_streaming_iterator(
+            data_path_and_name_and_type,
+            dtype=dtype,
+            batch_size=batch_size,
+            key_file=key_file,
+            num_workers=num_workers,
+            preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False),
+            collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False),
+            allow_variable_data_keys=allow_variable_data_keys,
+            inference=True,
+        )
+
+        finish_count = 0
+        file_count = 1
+        # 7. Start for-loop
+        # FIXME(kamo): The output format should be discussed about
+        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        if output_path is not None:
+            writer = DatadirWriter(output_path)
+            ibest_writer = writer[f"1best_recog"]
+        else:
+            writer = None
+            ibest_writer = None
+
+        vad_results = []
+        batch_in_cache = param_dict['in_cache'] if param_dict is not None else dict()
+        is_final = param_dict.get('is_final', False) if param_dict is not None else False
+        max_end_sil = param_dict.get('max_end_sil', 800) if param_dict is not None else 800
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+            batch['in_cache'] = batch_in_cache
+            batch['is_final'] = is_final
+            batch['max_end_sil'] = max_end_sil
+
+            # do vad segment
+            _, results, param_dict['in_cache'] = speech2vadsegment(**batch)
+            # param_dict['in_cache'] = batch['in_cache']
+            if results:
+                for i, _ in enumerate(keys):
+                    if results[i]:
+                        if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
+                            results[i] = json.dumps(results[i])
+                        item = {'key': keys[i], 'value': results[i]}
+                        vad_results.append(item)
+                        if writer is not None:
+                            results[i] = json.loads(results[i])
+                            ibest_writer["text"][keys[i]] = "{}".format(results[i])
+
+        return vad_results
+
+    return _forward
 
 
 def get_parser():
     parser = config_argparse.ArgumentParser(
@@ -354,6 +544,11 @@ def get_parser():
         type=str,
         help="Global cmvn file",
     )
+    group.add_argument(
+        "--online",
+        type=str2bool,
+        help="online decoding mode",
+    )
 
     group = parser.add_argument_group("infer related")
     group.add_argument(
@@ -377,3 +572,4 @@ def main(cmd=None):
 
 if __name__ == "__main__":
     main()
+
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 4a1f334cf..de589259f 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -1,4 +1,9 @@
 #!/usr/bin/env python3
+# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+import torch
+torch.set_num_threads(1)
 
 import argparse
 import logging
@@ -109,8 +114,8 @@ def inference_launch(mode, **kwargs):
         from funasr.bin.vad_inference import inference_modelscope
         return inference_modelscope(**kwargs)
     elif mode == "online":
-        from funasr.bin.vad_inference_online import inference_modelscope
-        return inference_modelscope(**kwargs)
+        from funasr.bin.vad_inference import inference_modelscope_online
+        return inference_modelscope_online(**kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 50ec47515..d72c63549 100644
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -311,7 +311,7 @@ class E2EVadModel(nn.Module):
                                            0.000001))
 
     def ComputeScores(self, feats: torch.Tensor, in_cache: Dict[str, torch.Tensor]) -> None:
-        scores = self.encoder(feats, in_cache)  # return B * T * D
+        scores = self.encoder(feats, in_cache).to('cpu')  # return B * T * D
         assert scores.shape[1] == feats.shape[1], "The shape between feats and scores does not match"
         self.vad_opts.nn_eval_block_size = scores.shape[1]
         self.frm_cnt += scores.shape[1]  # count total frames
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 203f00e04..3661f6b11 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -34,7 +34,7 @@ def load_cmvn(cmvn_file):
     means = np.array(means_list).astype(np.float)
     vars = np.array(vars_list).astype(np.float)
     cmvn = np.array([means, vars])
-    cmvn = torch.as_tensor(cmvn)
+    cmvn = torch.as_tensor(cmvn, dtype=torch.float32)
     return cmvn
 
 
@@ -47,10 +47,10 @@ def apply_cmvn(inputs, cmvn):  # noqa
     dtype = inputs.dtype
     frame, dim = inputs.shape
 
-    means = np.tile(cmvn[0:1, :dim], (frame, 1))
-    vars = np.tile(cmvn[1:2, :dim], (frame, 1))
-    inputs += torch.from_numpy(means).type(dtype).to(device)
-    inputs *= torch.from_numpy(vars).type(dtype).to(device)
+    means = cmvn[0:1, :dim]
+    vars = cmvn[1:2, :dim]
+    inputs += means.to(device)
+    inputs *= vars.to(device)
 
     return inputs.type(torch.float32)
 
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
index c92db4e55..5478236db 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/frontend.py
@@ -217,7 +217,7 @@ class WavFrontendOnline(WavFrontend):
         frame_num = self.compute_frame_num(input.shape[-1], self.frame_sample_length, self.frame_shift_sample_length)
         # update self.in_cache
        self.input_cache = input[:, -(input.shape[-1] - frame_num * self.frame_shift_sample_length):]
-        waveforms = np.empty(0, dtype=np.int16)
+        waveforms = np.empty(0, dtype=np.float32)
         feats_pad = np.empty(0, dtype=np.float32)
         feats_lens = np.empty(0, dtype=np.int32)
         if frame_num:
@@ -237,7 +237,7 @@ class WavFrontendOnline(WavFrontend):
                 mat[i, :] = self.fbank_fn.get_frame(i)
             feat = mat.astype(np.float32)
             feat_len = np.array(mat.shape[0]).astype(np.int32)
-            feats.append(mat)
+            feats.append(feat)
             feats_lens.append(feat_len)
 
             waveforms = np.stack(waveforms)
diff --git a/tests/test_vad_inference_pipeline.py b/tests/test_vad_inference_pipeline.py
index d22f46178..3a3f0f6d3 100644
--- a/tests/test_vad_inference_pipeline.py
+++ b/tests/test_vad_inference_pipeline.py
@@ -20,6 +20,13 @@ class TestFSMNInferencePipelines(unittest.TestCase):
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav')
         logger.info("vad inference result: {0}".format(rec_result))
+        assert rec_result[
+                   "text"] == "[[0, 1960], [2870, 6730], [7960, 10180], [12140, 14830], [15740, 19400], " \
+                              "[20220, 24230], [25540, 27290], [30070, 30970], [32070, 34280], [35990, 37050], " \
+                              "[39400, 41020], [41810, 47320], [48120, 52150], [53560, 58310], [59290, 62210], " \
+                              "[63110, 66420], [67300, 68280], [69670, 71770], [73100, 75550], [76850, 78500], " \
+                              "[79380, 83280], [85000, 92320], [93560, 94110], [94990, 95620], [96940, 97590], " \
+                              "[98400, 100530], [101600, 104890], [108780, 110900], [112020, 113460], [114210, 115030]]"
 
     def test_16k(self):
         inference_pipeline = pipeline(
@@ -29,6 +36,10 @@ class TestFSMNInferencePipelines(unittest.TestCase):
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
         logger.info("vad inference result: {0}".format(rec_result))
+        assert rec_result[
+                   "text"] == "[[70, 2340], [2620, 6200], [6480, 23670], [23950, 26250], [26780, 28990], " \
+                              "[29950, 31430], [31750, 37600], [38210, 46900], [47310, 49630], [49910, 56460], " \
+                              "[56740, 59540], [59820, 70450]]"
 
 
 if __name__ == '__main__':
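
Usage sketch (not part of the patch): a minimal example of driving the new online entry point, inference_modelscope_online, chunk by chunk. The config/model/cmvn file names, the 16 kHz mono input, and the 600 ms chunk size are illustrative assumptions; only the function itself and the param_dict keys (in_cache, is_final, max_end_sil) come from the diff above.

# Hedged sketch: placeholder paths and chunking, not a tested recipe.
import numpy as np
import soundfile

from funasr.bin.vad_inference import inference_modelscope_online

# Build the streaming forward function once (file names below are assumed).
forward = inference_modelscope_online(
    batch_size=1,
    ngpu=0,
    log_level="INFO",
    vad_infer_config="vad.yaml",   # assumed FSMN-VAD config path
    vad_model_file="vad.pb",       # assumed model file path
    vad_cmvn_file="vad.mvn",       # assumed cmvn file path
)

speech, fs = soundfile.read("speech.wav")  # assumed 16 kHz mono audio
chunk_size = 9600                          # assumed 600 ms chunks at 16 kHz
param_dict = {"in_cache": dict(), "max_end_sil": 800}

segments = []
for start in range(0, len(speech), chunk_size):
    chunk = speech[start:start + chunk_size].astype(np.float32)
    param_dict["is_final"] = start + chunk_size >= len(speech)
    # _forward overwrites param_dict["in_cache"] after each call, carrying the
    # streaming state (frontend/encoder cache) over to the next chunk.
    results = forward(None, raw_inputs=chunk, param_dict=param_dict)
    for item in results:
        segments.extend(item["value"])     # [[begin_ms, end_ms], ...]

print(segments)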