diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 42c5c1e12..18eba33fb 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -110,7 +110,8 @@ def inference_launch(mode, **kwargs):
     if mode == "offline":
         from funasr.bin.vad_inference import inference_modelscope
         return inference_modelscope(**kwargs)
-    elif mode == "online":
+    # elif mode == "online":
+    if (kwargs.get("param_dict") or {}).get("online"):
         from funasr.bin.vad_inference_online import inference_modelscope
         return inference_modelscope(**kwargs)
     else:
diff --git a/funasr/export/README.md b/funasr/export/README.md
index c44ad3382..33ab22ea9 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -11,31 +11,43 @@ The installation is the same as [funasr](../../README.md)
    `Tips`: torch>=1.11.0
 
    ```shell
-   python -m funasr.export.export_model [model_name] [export_dir] [onnx]
+   python -m funasr.export.export_model \
+       --model-name [model_name] \
+       --export-dir [export_dir] \
+       --type [onnx, torch] \
+       --quantize \
+       --fallback-num [fallback_num]
    ```
-   `model_name`: the model is to export. It could be the models from modelscope, or local finetuned model(named: model.pb).
-   `export_dir`: the dir where the onnx is export.
-   `onnx`: `true`, export onnx format model; `false`, export torchscripts format model.
+   `model-name`: the model to export. It can be a model from modelscope, or a local finetuned model (named: model.pb).
+
+   `export-dir`: the directory where the exported model is saved.
+
+   `type`: `onnx` or `torch`, export onnx format model or torchscript format model.
+
+   `quantize`: if set, also export a quantized model; otherwise export the fp32 model only.
+
+   `fallback-num`: specify the number of fallback layers to perform automatic mixed precision quantization.
+
 
 ## For example
 ### Export onnx format model
 Export model from modelscope
 ```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
 ```
 
 Export model from local path, the model'name must be `model.pb`.
 ```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
 ```
 
 ### Export torchscripts format model
 Export model from modelscope
 ```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
 ```
 
 Export model from local path, the model'name must be `model.pb`.
 ```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
 ```
diff --git a/funasr/export/export_model.py b/funasr/export/export_model.py
index 3cbf6d293..9a1ef9604 100644
--- a/funasr/export/export_model.py
+++ b/funasr/export/export_model.py
@@ -15,7 +15,15 @@ import random
 # assert torch_version > 1.9
 
 class ASRModelExportParaformer:
-    def __init__(self, cache_dir: Union[Path, str] = None, onnx: bool = True):
+    def __init__(
+        self,
+        cache_dir: Union[Path, str] = None,
+        onnx: bool = True,
+        quant: bool = True,
+        fallback_num: int = 0,
+        audio_in: str = None,
+        calib_num: int = 200,
+    ):
         assert check_argument_types()
         self.set_all_random_seed(0)
         if cache_dir is None:
@@ -28,6 +36,11 @@ class ASRModelExportParaformer:
         )
         print("output dir: {}".format(self.cache_dir))
         self.onnx = onnx
+        self.quant = quant
+        self.fallback_num = fallback_num
+        self.frontend = None
+        self.audio_in = audio_in
+        self.calib_num = calib_num
 
 
     def _export(
@@ -56,6 +69,43 @@ class ASRModelExportParaformer:
         print("output dir: {}".format(export_dir))
 
 
+    def _torch_quantize(self, model):
+        """Calibrate `model` with torch_quant and return the quantized model."""
+        def _run_calibration_data(m):
+            # calibrate on features from `audio_in` when given, otherwise on dummy inputs
+            if self.audio_in is not None:
+                feats, feats_len = self.load_feats(self.audio_in)
+                for feat, feat_len in zip(feats, feats_len):
+                    with torch.no_grad():
+                        m(feat, feat_len)
+            else:
+                dummy_input = model.get_dummy_inputs()
+                m(*dummy_input)
+
+        from torch_quant.module import ModuleFilter
+        from torch_quant.quantizer import Backend, Quantizer
+        from funasr.export.models.modules.decoder_layer import DecoderLayerSANM
+        from funasr.export.models.modules.encoder_layer import EncoderLayerSANM
+        module_filter = ModuleFilter(include_classes=[EncoderLayerSANM, DecoderLayerSANM])
+        module_filter.exclude_op_types = [torch.nn.Conv1d]
+        quantizer = Quantizer(
+            module_filter=module_filter,
+            backend=Backend.FBGEMM,
+        )
+        model.eval()
+        calib_model = quantizer.calib(model)
+        _run_calibration_data(calib_model)
+        if self.fallback_num > 0:
+            # perform automatic mixed precision quantization
+            amp_model = quantizer.amp(model)
+            _run_calibration_data(amp_model)
+            quantizer.fallback(amp_model, num=self.fallback_num)
+            print('Fallback layers:')
+            print('\n'.join(quantizer.module_filter.exclude_names))
+        quant_model = quantizer.quantize(model)
+        return quant_model
+
+
     def _export_torchscripts(self, model, verbose, path, enc_size=None):
         if enc_size:
             dummy_input = model.get_dummy_inputs(enc_size)
@@ -66,10 +116,49 @@ class ASRModelExportParaformer:
         model_script = torch.jit.trace(model, dummy_input)
         model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))
 
+        if self.quant:
+            quant_model = self._torch_quantize(model)
+            model_script = torch.jit.trace(quant_model, dummy_input)
+            model_script.save(os.path.join(path, f'{model.model_name}_quant.torchscripts'))
+
+
     def set_all_random_seed(self, seed: int):
         random.seed(seed)
         np.random.seed(seed)
         torch.random.manual_seed(seed)
+
+    def parse_audio_in(self, audio_in):
+        """Return (wav paths, names) from a .scp list (first `calib_num` lines) or a single audio path."""
+        wav_list, name_list = [], []
+        if audio_in.endswith(".scp"):
+            with open(audio_in, 'r') as f:
+                lines = f.readlines()[:self.calib_num]
+            for line in lines:
+                name, path = line.strip().split()
+                name_list.append(name)
+                wav_list.append(path)
+        else:
+            wav_list = [audio_in,]
+            name_list = ["test",]
+        return wav_list, name_list
+
+    def load_feats(self, audio_in: str = None):
+        """Load the audio named by `audio_in` and return the frontend features and their lengths."""
+        import torchaudio
+        wav_list, name_list = self.parse_audio_in(audio_in)
+        feats = []
+        feats_len = []
+        for line in wav_list:
+            path = line.strip()
+            waveform, sampling_rate = torchaudio.load(path)
+            if sampling_rate != self.frontend.fs:
+                waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                          new_freq=self.frontend.fs)(waveform)
+            fbank, fbank_len = self.frontend(waveform, [waveform.size(1)])
+            feats.append(fbank)
+            feats_len.append(fbank_len)
+        return feats, feats_len
+
     def export(self,
                tag_name: str = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
                mode: str = 'paraformer',
@@ -96,6 +185,7 @@ class ASRModelExportParaformer:
         model, asr_train_args = ASRTask.build_model_from_file(
             asr_train_config, asr_model_file, cmvn_file, 'cpu'
         )
+        self.frontend = model.frontend
         self._export(model, tag_name)
 
 
@@ -107,11 +197,12 @@ class ASRModelExportParaformer:
 
         # model_script = torch.jit.script(model)
         model_script = model #torch.jit.trace(model)
+        model_path = os.path.join(path, f'{model.model_name}.onnx')
 
         torch.onnx.export(
             model_script,
             dummy_input,
-            os.path.join(path, f'{model.model_name}.onnx'),
+            model_path,
             verbose=verbose,
             opset_version=14,
             input_names=model.get_input_names(),
@@ -119,17 +210,42 @@ class ASRModelExportParaformer:
             dynamic_axes=model.get_dynamic_axes()
         )
 
+        if self.quant:
+            from onnxruntime.quantization import QuantType, quantize_dynamic
+            import onnx
+            quant_model_path = os.path.join(path, f'{model.model_name}_quant.onnx')
+            onnx_model = onnx.load(model_path)
+            nodes = [n.name for n in onnx_model.graph.node]
+            nodes_to_exclude = [m for m in nodes if 'output' in m]
+            quantize_dynamic(
+                model_input=model_path,
+                model_output=quant_model_path,
+                op_types_to_quantize=['MatMul'],
+                per_channel=True,
+                reduce_range=False,
+                weight_type=QuantType.QUInt8,
+                nodes_to_exclude=nodes_to_exclude,
+            )
+
 
 if __name__ == '__main__':
-    import sys
-
-    model_path = sys.argv[1]
-    output_dir = sys.argv[2]
-    onnx = sys.argv[3]
-    onnx = onnx.lower()
-    onnx = onnx == 'true'
-    # model_path = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
-    # output_dir = "../export"
-    export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=onnx)
-    export_model.export(model_path)
-    # export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
\ No newline at end of file
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model-name', type=str, required=True)
+    parser.add_argument('--export-dir', type=str, required=True)
+    parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
+    parser.add_argument('--quantize', action='store_true', help='export quantized model')
+    parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
+    parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
+    parser.add_argument('--calib_num', type=int, default=200, help='calib max num')
+    args = parser.parse_args()
+
+    export_model = ASRModelExportParaformer(
+        cache_dir=args.export_dir,
+        onnx=args.type == 'onnx',
+        quant=args.quantize,
+        fallback_num=args.fallback_num,
+        audio_in=args.audio_in,
+        calib_num=args.calib_num,
+    )
+    export_model.export(args.model_name)
diff --git a/funasr/export/models/modules/encoder_layer.py b/funasr/export/models/modules/encoder_layer.py
index d13257462..7d0139793 100644
--- a/funasr/export/models/modules/encoder_layer.py
+++ b/funasr/export/models/modules/encoder_layer.py
@@ -16,6 +16,7 @@ class EncoderLayerSANM(nn.Module):
         self.feed_forward = model.feed_forward
         self.norm1 = model.norm1
         self.norm2 = model.norm2
+        self.in_size = model.in_size
         self.size = model.size
 
     def forward(self, x, mask):
@@ -23,13 +24,12 @@ class EncoderLayerSANM(nn.Module):
         residual = x
         x = self.norm1(x)
         x = self.self_attn(x, mask)
-        if x.size(2) == residual.size(2):
+        if self.in_size == self.size:
             x = x + residual
 
         residual = x
         x = self.norm2(x)
         x = self.feed_forward(x)
-        if x.size(2) == residual.size(2):
-            x = x + residual
+        x = x + residual
 
         return x, mask
diff --git a/funasr/export/models/modules/multihead_att.py b/funasr/export/models/modules/multihead_att.py
index 7d685f588..0a5667689 100644
--- a/funasr/export/models/modules/multihead_att.py
+++ b/funasr/export/models/modules/multihead_att.py
@@ -64,6 +64,23 @@ class MultiHeadedAttentionSANM(nn.Module):
         return self.linear_out(context_layer)  # (batch, time1, d_model)
 
 
+def preprocess_for_attn(x, mask, cache, pad_fn):
+    """Mask and transpose `x`, then pad it (no cache) or prepend `cache[:, :, 1:]` and update the cache."""
+    x = x * mask
+    x = x.transpose(1, 2)
+    if cache is None:
+        x = pad_fn(x)
+    else:
+        x = torch.cat((cache[:, :, 1:], x), dim=2)
+        cache = x
+    return x, cache
+
+
+# Register as an FX leaf function so symbolic tracing keeps the call instead of tracing its branch.
+import torch.fx
+torch.fx.wrap('preprocess_for_attn')
+
+
 class MultiHeadedAttentionSANMDecoder(nn.Module):
     def __init__(self, model):
         super().__init__()
@@ -73,16 +90,7 @@ class MultiHeadedAttentionSANMDecoder(nn.Module):
         self.attn = None
 
     def forward(self, inputs, mask, cache=None):
-        # b, t, d = inputs.size()
-        # mask = torch.reshape(mask, (b, -1, 1))
-        inputs = inputs * mask
-
-        x = inputs.transpose(1, 2)
-        if cache is None:
-            x = self.pad_fn(x)
-        else:
-            x = torch.cat((cache[:, :, 1:], x), dim=2)
-            cache = x
+        x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn)
         x = self.fsmn_block(x)
         x = x.transpose(1, 2)
 
@@ -232,4 +240,4 @@ class OnnxRelPosMultiHeadedAttention(OnnxMultiHeadedAttention):
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(new_context_layer_shape)
         return self.linear_out(context_layer)  # (batch, time1, d_model)
-    
\ No newline at end of file
+
diff --git a/funasr/runtime/python/utils/test_rtf.py b/funasr/runtime/python/utils/test_rtf.py
new file mode 100644
index 000000000..3394e8a04
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.py
@@ -0,0 +1,48 @@
+# Usage: python test_rtf.py <backend> <model_dir> <wav_list>; prints compute time, audio time and their ratio.
+import time
+import sys
+import librosa
+backend=sys.argv[1]
+model_dir=sys.argv[2]
+wav_file=sys.argv[3]
+
+if backend == "onnxruntime":
+    from rapid_paraformer import Paraformer
+else:
+    from torch_paraformer import Paraformer
+
+model = Paraformer(model_dir, batch_size=1, device_id="-1")
+
+with open(wav_file, 'r') as wav_file_f:
+    wav_files = wav_file_f.readlines()
+
+# warm-up
+total = 0.0
+num = 100
+wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
+for i in range(num):
+    beg_time = time.time()
+    result = model(wav_path)
+    end_time = time.time()
+    duration = end_time-beg_time
+    total += duration
+    print(result)
+    print("num: {}, time, {}, avg: {}, rtf: {}".format(len(wav_path), duration, total/(i+1), (total/(i+1))/5.53))
+
+# infer time
+beg_time = time.time()
+for i, wav_path_i in enumerate(wav_files):
+    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+    result = model(wav_path)
+end_time = time.time()
+duration = (end_time-beg_time)*1000
+print("total_time_comput_ms: {}".format(int(duration)))
+
+duration_time = 0.0
+for i, wav_path_i in enumerate(wav_files):
+    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+    waveform, _ = librosa.load(wav_path, sr=16000)
+    duration_time += len(waveform)/16.0
+print("total_time_wav_ms: {}".format(int(duration_time)))
+
+print("total_rtf: {:.5}".format(duration/duration_time))
\ No newline at end of file
diff --git a/funasr/runtime/python/utils/test_rtf.sh b/funasr/runtime/python/utils/test_rtf.sh
new file mode 100644
index 000000000..fe13da7d8
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.sh
@@ -0,0 +1,74 @@
+
+nj=64
+
+#:< ${local_scp_dir}/log.$JOB.txt
+    }&
+
+done
+wait
+
+
+rm -rf ${local_scp_dir}/total_time_comput.txt
+rm -rf ${local_scp_dir}/total_time_wav.txt
+rm -rf ${local_scp_dir}/total_rtf.txt
+for JOB in $(seq ${nj}); do
+    {
+    cat ${local_scp_dir}/log.$JOB.txt | grep "total_time_comput" | awk -F ' '  '{print $2}' >> ${local_scp_dir}/total_time_comput.txt
+    cat ${local_scp_dir}/log.$JOB.txt | grep "total_time_wav" | awk -F ' '  '{print $2}' >> ${local_scp_dir}/total_time_wav.txt
+    cat ${local_scp_dir}/log.$JOB.txt | grep "total_rtf" | awk -F ' '  '{print $2}' >> ${local_scp_dir}/total_rtf.txt
+    }
+
+done
+
+total_time_comput=`cat ${local_scp_dir}/total_time_comput.txt | awk 'BEGIN {max = 0} {if ($1+0>max+0) max=$1} END {print max}'`
+total_time_wav=`cat ${local_scp_dir}/total_time_wav.txt | awk '{sum +=$1};END {print sum}'`
+rtf=`awk 'BEGIN{printf "%.5f\n",'$total_time_comput'/'$total_time_wav'}'`
+speed=`awk 'BEGIN{printf "%.2f\n",1/'$rtf'}'`
+
+echo "total_time_comput_ms: $total_time_comput"
+echo "total_time_wav: $total_time_wav"
+echo "total_rtf: $rtf, speech: $speed"
\ No newline at end of file