diff --git a/funasr/runtime/triton_gpu/Dockerfile/Dockerfile.server b/funasr/runtime/triton_gpu/Dockerfile/Dockerfile.server new file mode 100644 index 000000000..459195cae --- /dev/null +++ b/funasr/runtime/triton_gpu/Dockerfile/Dockerfile.server @@ -0,0 +1,17 @@ +FROM nvcr.io/nvidia/tritonserver:23.01-py3 +# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html +# Please choose previous tritonserver:xx.xx if you encounter cuda driver mismatch issue + +LABEL maintainer="NVIDIA" +LABEL repository="tritonserver" + +RUN apt-get update && apt-get -y install \ + python3-dev \ + cmake \ + libsndfile1 + +RUN pip3 install kaldifeat pyyaml + +# Dependency for client +RUN pip3 install soundfile grpcio-tools tritonclient pyyaml +WORKDIR /workspace diff --git a/funasr/runtime/triton_gpu/README.md b/funasr/runtime/triton_gpu/README.md new file mode 100644 index 000000000..ebaa819f4 --- /dev/null +++ b/funasr/runtime/triton_gpu/README.md @@ -0,0 +1,52 @@ +## Inference with Triton + +### Steps: +1. Refer here to [get model.onnx](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime#steps) + +2. Follow below instructions to using triton +```sh +# using docker image Dockerfile/Dockerfile.server +docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01 +docker run -it --rm --name "paraformer_triton_server" --gpus all -v :/workspace --shm-size 1g --net host triton-paraformer:23.01 +# inside the docker container, prepare previous exported model.onnx +mv /workspace/triton_gpu/model_repo_paraformer_large_offline/encoder/1/ + +model_repo_paraformer_large_offline/ +|-- encoder +| |-- 1 +| | `-- model.onnx +| `-- config.pbtxt +|-- feature_extractor +| |-- 1 +| | `-- model.py +| |-- config.pbtxt +| `-- config.yaml +|-- infer_pipeline +| |-- 1 +| `-- config.pbtxt +`-- scoring + |-- 1 + | `-- model.py + |-- config.pbtxt + `-- token_list.pkl + +8 directories, 9 files + +# launch the service +tritonserver --model-repository ./model_repo_paraformer_large_offline \ + --pinned-memory-pool-byte-size=512000000 \ + --cuda-memory-pool-byte-size=0:1024000000 + +``` + +### Performance benchmark + +Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds. + +(Note: The service has been fully warm up.) +|concurrent-tasks | processing time(s) | RTF | +|----------|--------------------|------------| +| 60 (onnx fp32) | 116.0 | 0.0032| + +## Acknowledge +This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us. \ No newline at end of file diff --git a/funasr/runtime/triton_gpu/client/client.py b/funasr/runtime/triton_gpu/client/client.py new file mode 100644 index 000000000..6cdbe9204 --- /dev/null +++ b/funasr/runtime/triton_gpu/client/client.py @@ -0,0 +1,191 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import multiprocessing +from multiprocessing import Pool + +import argparse +import os +import tritonclient.grpc as grpcclient +from utils import cal_cer +from speech_client import * +import numpy as np + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "-v", + "--verbose", + action="store_true", + required=False, + default=False, + help="Enable verbose output", + ) + parser.add_argument( + "-u", + "--url", + type=str, + required=False, + default="localhost:10086", + help="Inference server URL. Default is " "localhost:8001.", + ) + parser.add_argument( + "--model_name", + required=False, + default="attention_rescoring", + choices=["attention_rescoring", "streaming_wenet", "infer_pipeline"], + help="the model to send request to", + ) + parser.add_argument( + "--wavscp", + type=str, + required=False, + default=None, + help="audio_id \t wav_path", + ) + parser.add_argument( + "--trans", + type=str, + required=False, + default=None, + help="audio_id \t text", + ) + parser.add_argument( + "--data_dir", + type=str, + required=False, + default=None, + help="path prefix for wav_path in wavscp/audio_file", + ) + parser.add_argument( + "--audio_file", + type=str, + required=False, + default=None, + help="single wav file path", + ) + # below arguments are for streaming + # Please check onnx_config.yaml and train.yaml + parser.add_argument("--streaming", action="store_true", required=False) + parser.add_argument( + "--sample_rate", + type=int, + required=False, + default=16000, + help="sample rate used in training", + ) + parser.add_argument( + "--frame_length_ms", + type=int, + required=False, + default=25, + help="frame length", + ) + parser.add_argument( + "--frame_shift_ms", + type=int, + required=False, + default=10, + help="frame shift length", + ) + parser.add_argument( + "--chunk_size", + type=int, + required=False, + default=16, + help="chunk size default is 16", + ) + parser.add_argument( + "--context", + type=int, + required=False, + default=7, + help="subsampling context", + ) + parser.add_argument( + "--subsampling", + type=int, + required=False, + default=4, + help="subsampling rate", + ) + + FLAGS = parser.parse_args() + print(FLAGS) + + # load data + filenames = [] + transcripts = [] + if FLAGS.audio_file is not None: + path = FLAGS.audio_file + if FLAGS.data_dir: + path = os.path.join(FLAGS.data_dir, path) + if os.path.exists(path): + filenames = [path] + elif FLAGS.wavscp is not None: + audio_data = {} + with open(FLAGS.wavscp, "r", encoding="utf-8") as f: + for line in f: + aid, path = line.strip().split("\t") + if FLAGS.data_dir: + path = os.path.join(FLAGS.data_dir, path) + audio_data[aid] = {"path": path} + with open(FLAGS.trans, "r", encoding="utf-8") as f: + for line in f: + aid, text = line.strip().split("\t") + audio_data[aid]["text"] = text + for key, value in audio_data.items(): + filenames.append(value["path"]) + transcripts.append(value["text"]) + + num_workers = multiprocessing.cpu_count() // 2 + + if FLAGS.streaming: + speech_client_cls = StreamingSpeechClient + else: + speech_client_cls = OfflineSpeechClient + + def single_job(client_files): + with grpcclient.InferenceServerClient( + url=FLAGS.url, verbose=FLAGS.verbose + ) as triton_client: + protocol_client = grpcclient + speech_client = speech_client_cls( + triton_client, FLAGS.model_name, protocol_client, FLAGS + ) + idx, audio_files = client_files + predictions = [] + for li in audio_files: + result = speech_client.recognize(li, idx) + print("Recognized {}:{}".format(li, result[0])) + predictions += result + return predictions + + # start to do inference + # Group requests in batches + predictions = [] + tasks = [] + splits = np.array_split(filenames, num_workers) + + for idx, per_split in enumerate(splits): + cur_files = per_split.tolist() + tasks.append((idx, cur_files)) + + with Pool(processes=num_workers) as pool: + predictions = pool.map(single_job, tasks) + + predictions = [item for sublist in predictions for item in sublist] + if transcripts: + cer = cal_cer(predictions, transcripts) + print("CER is: {}".format(cer)) diff --git a/funasr/runtime/triton_gpu/client/speech_client.py b/funasr/runtime/triton_gpu/client/speech_client.py new file mode 100644 index 000000000..1bafbe246 --- /dev/null +++ b/funasr/runtime/triton_gpu/client/speech_client.py @@ -0,0 +1,142 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from tritonclient.utils import np_to_triton_dtype +import numpy as np +import math +import soundfile as sf + + +class OfflineSpeechClient(object): + def __init__(self, triton_client, model_name, protocol_client, args): + self.triton_client = triton_client + self.protocol_client = protocol_client + self.model_name = model_name + + def recognize(self, wav_file, idx=0): + waveform, sample_rate = sf.read(wav_file) + samples = np.array([waveform], dtype=np.float32) + lengths = np.array([[len(waveform)]], dtype=np.int32) + # better pad waveform to nearest length here + # target_seconds = math.cel(len(waveform) / sample_rate) + # target_samples = np.zeros([1, target_seconds * sample_rate]) + # target_samples[0][0: len(waveform)] = waveform + # samples = target_samples + sequence_id = 10086 + idx + result = "" + inputs = [ + self.protocol_client.InferInput( + "WAV", samples.shape, np_to_triton_dtype(samples.dtype) + ), + self.protocol_client.InferInput( + "WAV_LENS", lengths.shape, np_to_triton_dtype(lengths.dtype) + ), + ] + inputs[0].set_data_from_numpy(samples) + inputs[1].set_data_from_numpy(lengths) + outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")] + response = self.triton_client.infer( + self.model_name, + inputs, + request_id=str(sequence_id), + outputs=outputs, + ) + result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8") + return [result] + + +class StreamingSpeechClient(object): + def __init__(self, triton_client, model_name, protocol_client, args): + self.triton_client = triton_client + self.protocol_client = protocol_client + self.model_name = model_name + chunk_size = args.chunk_size + subsampling = args.subsampling + context = args.context + frame_shift_ms = args.frame_shift_ms + frame_length_ms = args.frame_length_ms + # for the first chunk + # we need additional frames to generate + # the exact first chunk length frames + # since the subsampling will look ahead several frames + first_chunk_length = (chunk_size - 1) * subsampling + context + add_frames = math.ceil( + (frame_length_ms - frame_shift_ms) / frame_shift_ms + ) + first_chunk_ms = (first_chunk_length + add_frames) * frame_shift_ms + other_chunk_ms = chunk_size * subsampling * frame_shift_ms + self.first_chunk_in_secs = first_chunk_ms / 1000 + self.other_chunk_in_secs = other_chunk_ms / 1000 + + def recognize(self, wav_file, idx=0): + waveform, sample_rate = sf.read(wav_file) + wav_segs = [] + i = 0 + while i < len(waveform): + if i == 0: + stride = int(self.first_chunk_in_secs * sample_rate) + wav_segs.append(waveform[i : i + stride]) + else: + stride = int(self.other_chunk_in_secs * sample_rate) + wav_segs.append(waveform[i : i + stride]) + i += len(wav_segs[-1]) + + sequence_id = idx + 10086 + # simulate streaming + for idx, seg in enumerate(wav_segs): + chunk_len = len(seg) + if idx == 0: + chunk_samples = int(self.first_chunk_in_secs * sample_rate) + expect_input = np.zeros((1, chunk_samples), dtype=np.float32) + else: + chunk_samples = int(self.other_chunk_in_secs * sample_rate) + expect_input = np.zeros((1, chunk_samples), dtype=np.float32) + + expect_input[0][0:chunk_len] = seg + input0_data = expect_input + input1_data = np.array([[chunk_len]], dtype=np.int32) + + inputs = [ + self.protocol_client.InferInput( + "WAV", + input0_data.shape, + np_to_triton_dtype(input0_data.dtype), + ), + self.protocol_client.InferInput( + "WAV_LENS", + input1_data.shape, + np_to_triton_dtype(input1_data.dtype), + ), + ] + + inputs[0].set_data_from_numpy(input0_data) + inputs[1].set_data_from_numpy(input1_data) + + outputs = [self.protocol_client.InferRequestedOutput("TRANSCRIPTS")] + end = False + if idx == len(wav_segs) - 1: + end = True + + response = self.triton_client.infer( + self.model_name, + inputs, + outputs=outputs, + sequence_id=sequence_id, + sequence_start=idx == 0, + sequence_end=end, + ) + idx += 1 + result = response.as_numpy("TRANSCRIPTS")[0].decode("utf-8") + print("Get response from {}th chunk: {}".format(idx, result)) + return [result] diff --git a/funasr/runtime/triton_gpu/client/test_wavs/long.wav b/funasr/runtime/triton_gpu/client/test_wavs/long.wav new file mode 100644 index 000000000..d2430cacb Binary files /dev/null and b/funasr/runtime/triton_gpu/client/test_wavs/long.wav differ diff --git a/funasr/runtime/triton_gpu/client/test_wavs/mid.wav b/funasr/runtime/triton_gpu/client/test_wavs/mid.wav new file mode 100644 index 000000000..45a4259db Binary files /dev/null and b/funasr/runtime/triton_gpu/client/test_wavs/mid.wav differ diff --git a/funasr/runtime/triton_gpu/client/utils.py b/funasr/runtime/triton_gpu/client/utils.py new file mode 100644 index 000000000..ec102f71d --- /dev/null +++ b/funasr/runtime/triton_gpu/client/utils.py @@ -0,0 +1,60 @@ +import numpy as np + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in range(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in range(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in range(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def cal_cer(references, predictions): + errors = 0 + lengths = 0 + for ref, pred in zip(references, predictions): + cur_ref = list(ref) + cur_hyp = list(pred) + cur_error = _levenshtein_distance(cur_ref, cur_hyp) + errors += cur_error + lengths += len(cur_ref) + return float(errors) / lengths diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/1/.gitkeep b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/1/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt new file mode 100644 index 000000000..8d8999340 --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/encoder/config.pbtxt @@ -0,0 +1,61 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "encoder" +backend: "onnxruntime" +default_model_filename: "model.onnx" + +max_batch_size: 64 + +input [ + { + name: "speech" + data_type: TYPE_FP32 + dims: [-1, 560] + }, + { + name: "speech_lengths" + data_type: TYPE_INT32 + dims: [1] + reshape: { shape: [ ] } + } +] + +output [ + { + name: "logits" + data_type: TYPE_FP32 + dims: [-1, 8404] + }, + { + name: "token_num" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] + +dynamic_batching { + preferred_batch_size: [ 2,4,8,16,32,64 ] + max_queue_delay_microseconds: 500 + } + + +instance_group [ + { + count: 1 + kind: KIND_GPU + } +] + diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/1/model.py b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/1/model.py new file mode 100644 index 000000000..6464964fd --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/1/model.py @@ -0,0 +1,315 @@ +#!/bin/bash +# +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import math +import triton_python_backend_utils as pb_utils +from torch.utils.dlpack import to_dlpack +import torch +import numpy as np +import kaldifeat +import _kaldifeat +from typing import List +import json +import yaml +from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union + +class LFR(torch.nn.Module): + """Batch LFR: https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py """ + def __init__(self, m: int = 7, n: int = 6) -> None: + """ + Actually, this implements stacking frames and skipping frames. + if m = 1 and n = 1, just return the origin features. + if m = 1 and n > 1, it works like skipping. + if m > 1 and n = 1, it works like stacking but only support right frames. + if m > 1 and n > 1, it works like LFR. + """ + super().__init__() + + self.m = m + self.n = n + + self.left_padding_nums = math.ceil((self.m - 1) // 2) + + def forward(self, input_tensor: torch.Tensor, + input_lens: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]: + B, _, D = input_tensor.size() + n_lfr = torch.ceil(input_lens / self.n) + + prepad_nums = input_lens + self.left_padding_nums + + right_padding_nums = torch.where( + self.m >= (prepad_nums - self.n * (n_lfr - 1)), + self.m - (prepad_nums - self.n * (n_lfr - 1)), + 0, + ) + + T_all = self.left_padding_nums + input_lens + right_padding_nums + + new_len = T_all // self.n + + T_all_max = T_all.max().int() + + tail_frames_index = (input_lens - 1).view(B, 1, 1).repeat(1, 1, D) # [B,1,D] + + tail_frames = torch.gather(input_tensor, 1, tail_frames_index) + tail_frames = tail_frames.repeat(1, right_padding_nums.max().int(), 1) + head_frames = input_tensor[:, 0:1, :].repeat(1, self.left_padding_nums, 1) + + # stack + input_tensor = torch.cat([head_frames, input_tensor, tail_frames], dim=1) + + index = torch.arange(T_all_max, + device=input_tensor.device, + dtype=input_lens.dtype).unsqueeze(0).repeat(B, 1) # [B, T_all_max] + index_mask = (index < + (self.left_padding_nums + input_lens).unsqueeze(1) + ) #[B, T_all_max] + + tail_index_mask = torch.logical_not( + index >= (T_all.unsqueeze(1))) & index_mask + tail = torch.ones(T_all_max, + dtype=input_lens.dtype, + device=input_tensor.device).unsqueeze(0).repeat(B, 1) * ( + T_all_max - 1) # [B, T_all_max] + indices = torch.where(torch.logical_or(index_mask, tail_index_mask), + index, tail) + input_tensor = torch.gather(input_tensor, 1, indices.unsqueeze(2).repeat(1, 1, D)) + + input_tensor = input_tensor.unfold(1, self.m, step=self.n).transpose(2, 3) + + return input_tensor.reshape(B, -1, D * self.m), new_len + +class WavFrontend(): + """Conventional frontend structure for ASR. + """ + + def __init__( + self, + cmvn_file: str = None, + fs: int = 16000, + window: str = 'hamming', + n_mels: int = 80, + frame_length: int = 25, + frame_shift: int = 10, + filter_length_min: int = -1, + filter_length_max: float = -1, + lfr_m: int = 1, + lfr_n: int = 1, + dither: float = 1.0 + ) -> None: + # check_argument_types() + + self.fs = fs + self.window = window + self.n_mels = n_mels + self.frame_length = frame_length + self.frame_shift = frame_shift + self.filter_length_min = filter_length_min + self.filter_length_max = filter_length_max + self.lfr_m = lfr_m + self.lfr_n = lfr_n + self.lfr = LFR(lfr_m, lfr_n) + self.cmvn_file = cmvn_file + self.dither = dither + + if self.cmvn_file: + self.cmvn = self.load_cmvn() + + def apply_cmvn_batch(self, inputs: np.ndarray) -> np.ndarray: + """ + Apply CMVN with mvn data + """ + batch, frame, dim = inputs.shape + means = np.tile(self.cmvn[0:1, :dim], (frame, 1)) + vars = np.tile(self.cmvn[1:2, :dim], (frame, 1)) + + means = torch.from_numpy(means).to(inputs.device) + vars = torch.from_numpy(vars).to(inputs.device) + # print(inputs.shape, means.shape, vars.shape) + inputs = (inputs + means) * vars + return inputs + + def load_cmvn(self,) -> np.ndarray: + with open(self.cmvn_file, 'r', encoding='utf-8') as f: + lines = f.readlines() + + means_list = [] + vars_list = [] + for i in range(len(lines)): + line_item = lines[i].split() + if line_item[0] == '': + line_item = lines[i + 1].split() + if line_item[0] == '': + add_shift_line = line_item[3:(len(line_item) - 1)] + means_list = list(add_shift_line) + continue + elif line_item[0] == '': + line_item = lines[i + 1].split() + if line_item[0] == '': + rescale_line = line_item[3:(len(line_item) - 1)] + vars_list = list(rescale_line) + continue + + means = np.array(means_list).astype(np.float64) + vars = np.array(vars_list).astype(np.float64) + cmvn = np.array([means, vars]) + return cmvn + + +class Fbank(torch.nn.Module): + def __init__(self, opts): + super(Fbank, self).__init__() + self.fbank = kaldifeat.Fbank(opts) + + def forward(self, waves: List[torch.Tensor]): + return self.fbank(waves) + + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + self.device = "cuda" + + # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "speech") + # Convert Triton types to numpy types + output0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + if output0_dtype == np.float32: + self.output0_dtype = torch.float32 + else: + self.output0_dtype = torch.float16 + + # Get OUTPUT1 configuration + output1_config = pb_utils.get_output_config_by_name( + model_config, "speech_lengths") + # Convert Triton types to numpy types + self.output1_dtype = pb_utils.triton_string_to_numpy( + output1_config['data_type']) + + params = self.model_config['parameters'] + + for li in params.items(): + key, value = li + value = value["string_value"] + if key == "config_path": + with open(str(value), 'rb') as f: + config = yaml.load(f, Loader=yaml.Loader) + + opts = kaldifeat.FbankOptions() + opts.frame_opts.dither = 1.0 # TODO: 0.0 or 1.0 + opts.frame_opts.window_type = config['WavFrontend']['frontend_conf']['window'] + opts.mel_opts.num_bins = int(config['WavFrontend']['frontend_conf']['n_mels']) + opts.frame_opts.frame_shift_ms = float(config['WavFrontend']['frontend_conf']['frame_shift']) + opts.frame_opts.frame_length_ms = float(config['WavFrontend']['frontend_conf']['frame_length']) + opts.frame_opts.samp_freq = int(config['WavFrontend']['frontend_conf']['fs']) + opts.device = torch.device(self.device) + self.opts = opts + self.feature_extractor = Fbank(self.opts) + self.feature_size = opts.mel_opts.num_bins + + self.frontend = WavFrontend( + cmvn_file=config['WavFrontend']['cmvn_file'], + **config['WavFrontend']['frontend_conf']) + + def extract_feat(self, + waveform_list: List[np.ndarray] + ) -> Tuple[np.ndarray, np.ndarray]: + feats, feats_len = [], [] + wavs = [] + for waveform in waveform_list: + wav = torch.from_numpy(waveform).float().squeeze().to(self.device) + wavs.append(wav) + + features = self.feature_extractor(wavs) + features_len = [feature.shape[0] for feature in features] + speech = torch.zeros((len(features), max(features_len), self.opts.mel_opts.num_bins), + dtype=self.output0_dtype, device=self.device) + for i, feature in enumerate(features): + speech[i,:int(features_len[i])] = feature + speech_lens = torch.tensor(features_len,dtype=torch.int64).to(self.device) + + feats, feats_len = self.frontend.lfr(speech, speech_lens) + feats_len = feats_len.type(torch.int32) + + feats = self.frontend.apply_cmvn_batch(feats) + feats = feats.type(self.output0_dtype) + + return feats, feats_len + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + batch_count = [] + total_waves = [] + batch_len = [] + responses = [] + for request in requests: + + input0 = pb_utils.get_input_tensor_by_name(request, "wav") + input1 = pb_utils.get_input_tensor_by_name(request, "wav_lens") + + cur_b_wav = input0.as_numpy() * (1 << 15) # b x -1 + total_waves.append(cur_b_wav) + + features, feats_len = self.extract_feat(total_waves) + + for i in range(features.shape[0]): + speech = features[i:i+1][:int(feats_len[i].cpu())] + speech_lengths = feats_len[i].unsqueeze(0).unsqueeze(0) + + speech, speech_lengths = speech.cpu(), speech_lengths.cpu() + out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech)) + out1 = pb_utils.Tensor.from_dlpack("speech_lengths", + to_dlpack(speech_lengths)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1]) + responses.append(inference_response) + return responses diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.pbtxt b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.pbtxt new file mode 100644 index 000000000..8b5318389 --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.pbtxt @@ -0,0 +1,77 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "feature_extractor" +backend: "python" +max_batch_size: 64 + +parameters [ + { + key: "num_mel_bins", + value: { string_value: "80"} + }, + { + key: "frame_shift_in_ms" + value: { string_value: "10"} + }, + { + key: "frame_length_in_ms" + value: { string_value: "25"} + }, + { + key: "sample_rate" + value: { string_value: "16000"} + }, + { + key: "config_path" + value: { string_value: "./model_repo_paraformer_large_offline/feature_extractor/config.yaml"} + } + +] + +input [ + { + name: "wav" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "wav_lens" + data_type: TYPE_INT32 + dims: [1] + } +] + +output [ + { + name: "speech" + data_type: TYPE_FP32 + dims: [-1, 560] # 80 + }, + { + name: "speech_lengths" + data_type: TYPE_INT32 + dims: [1] + } +] + +dynamic_batching { + } + +instance_group [ + { + count: 2 + kind: KIND_GPU + } +] diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.yaml b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.yaml new file mode 100644 index 000000000..a4a66c37a --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/feature_extractor/config.yaml @@ -0,0 +1,30 @@ +TokenIDConverter: + token_path: resources/models/token_list.pkl + unk_symbol: + +CharTokenizer: + symbol_value: + space_symbol: + remove_non_linguistic_symbols: false + +WavFrontend: + cmvn_file: /raid/dgxsa/yuekaiz/pull_requests/FunASR/funasr/runtime/python/onnxruntime/resources/models/am.mvn + frontend_conf: + fs: 16000 + window: hamming + n_mels: 80 + frame_length: 25 + frame_shift: 10 + lfr_m: 7 + lfr_n: 6 + filter_length_max: -.inf + +Model: + model_path: resources/models/model.onnx + use_cuda: false + CUDAExecutionProvider: + device_id: 0 + arena_extend_strategy: kNextPowerOfTwo + cudnn_conv_algo_search: EXHAUSTIVE + do_copy_in_default_stream: true + batch_size: 3 \ No newline at end of file diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/infer_pipeline/1/.gitkeep b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/infer_pipeline/1/.gitkeep new file mode 100644 index 000000000..e69de29bb diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/infer_pipeline/config.pbtxt b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/infer_pipeline/config.pbtxt new file mode 100644 index 000000000..1a9c2661e --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/infer_pipeline/config.pbtxt @@ -0,0 +1,99 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "infer_pipeline" +platform: "ensemble" +max_batch_size: 64 #MAX_BATCH + +input [ + { + name: "WAV" + data_type: TYPE_FP32 + dims: [-1] + }, + { + name: "WAV_LENS" + data_type: TYPE_INT32 + dims: [1] + } +] + +output [ + { + name: "TRANSCRIPTS" + data_type: TYPE_STRING + dims: [1] + } +] + +ensemble_scheduling { + step [ + { + model_name: "feature_extractor" + model_version: -1 + input_map { + key: "wav" + value: "WAV" + } + input_map { + key: "wav_lens" + value: "WAV_LENS" + } + output_map { + key: "speech" + value: "SPEECH" + } + output_map { + key: "speech_lengths" + value: "SPEECH_LENGTHS" + } + }, + { + model_name: "encoder" + model_version: -1 + input_map { + key: "speech" + value: "SPEECH" + } + input_map { + key: "speech_lengths" + value: "SPEECH_LENGTHS" + } + output_map { + key: "logits" + value: "logits" + } + output_map { + key: "token_num" + value: "token_num" + } + }, + { + model_name: "scoring" + model_version: -1 + input_map { + key: "logits" + value: "logits" + } + input_map { + key: "token_num" + value: "token_num" + } + output_map { + key: "OUTPUT0" + value: "TRANSCRIPTS" + } + } + ] +} diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/1/model.py b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/1/model.py new file mode 100644 index 000000000..dfbaa52f4 --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/1/model.py @@ -0,0 +1,150 @@ +#!/bin/bash +# +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import triton_python_backend_utils as pb_utils +import numpy as np +import torch +from torch.utils.dlpack import from_dlpack + +import json +import os + +import pickle + +class TritonPythonModel: + """Your Python model must use the same class name. Every Python model + that is created must have "TritonPythonModel" as the class name. + """ + + def initialize(self, args): + """`initialize` is called only once when the model is being loaded. + Implementing `initialize` function is optional. This function allows + the model to initialize any state associated with this model. + + Parameters + ---------- + args : dict + Both keys and values are strings. The dictionary keys and values are: + * model_config: A JSON string containing the model configuration + * model_instance_kind: A string containing model instance kind + * model_instance_device_id: A string containing model instance device ID + * model_repository: Model repository path + * model_version: Model version + * model_name: Model name + """ + self.model_config = model_config = json.loads(args['model_config']) + self.max_batch_size = max(model_config["max_batch_size"], 1) + + # # Get OUTPUT0 configuration + output0_config = pb_utils.get_output_config_by_name( + model_config, "OUTPUT0") + # # Convert Triton types to numpy types + self.out0_dtype = pb_utils.triton_string_to_numpy( + output0_config['data_type']) + + self.init_vocab(self.model_config['parameters']) + + def init_vocab(self, parameters): + blank_id=0 + for li in parameters.items(): + key, value = li + value = value["string_value"] + if key == "blank_id": + self.blank_id = int(value) + elif key == "lm_path": + lm_path = value + elif key == "vocabulary": + self.vocab_dict = self.load_vocab(value) + if key == 'ignore_id': + ignore_id = int(value) + + def load_vocab(self, vocab_file): + """ + load lang_char.txt + """ + with open(str(vocab_file), 'rb') as f: + token_list = pickle.load(f) + return token_list + + def execute(self, requests): + """`execute` must be implemented in every Python model. `execute` + function receives a list of pb_utils.InferenceRequest as the only + argument. This function is called when an inference is requested + for this model. + + Parameters + ---------- + requests : list + A list of pb_utils.InferenceRequest + + Returns + ------- + list + A list of pb_utils.InferenceResponse. The length of this list must + be the same as `requests` + """ + # Every Python backend must iterate through list of requests and create + # an instance of pb_utils.InferenceResponse class for each of them. You + # should avoid storing any of the input Tensors in the class attributes + # as they will be overridden in subsequent inference requests. You can + # make a copy of the underlying NumPy array and store it if it is + # required. + + total_seq, max_token_num = 0, 0 + assert len(self.vocab_dict) == 8404, len(self.vocab_dict) + logits_list, token_num_list = [], [] + + for request in requests: + # Perform inference on the request and append it to responses list... + in_0 = pb_utils.get_input_tensor_by_name(request, "logits") + in_1 = pb_utils.get_input_tensor_by_name(request, "token_num") + + logits, token_num = from_dlpack(in_0.to_dlpack()), from_dlpack(in_1.to_dlpack()).cpu() + max_token_num = max(max_token_num, token_num) + + assert logits.shape[0] == 1 + logits_list.append(logits) + token_num_list.append(token_num) + total_seq +=1 + + logits_batch = torch.zeros(len(logits_list), max_token_num, len(self.vocab_dict), dtype=torch.float32, device=logits.device) + token_num_batch = torch.zeros(len(logits_list)) + + for i, (logits, token_num) in enumerate(zip(logits_list, token_num_list)): + logits_batch[i][:int(token_num)] = logits[0][:int(token_num)] + token_num_batch[i] = token_num + + yseq_batch = logits_batch.argmax(axis=-1).tolist() + token_int_batch = [list(filter(lambda x: x not in (0, 2), yseq)) for yseq in yseq_batch] + + tokens_batch = [[self.vocab_dict[i] for i in token_int] for token_int in token_int_batch] + + hyps = [u"".join([t if t != "" else " " for t in tokens]).encode('utf-8') for tokens in tokens_batch] + responses = [] + for i in range(total_seq): + sents = np.array(hyps[i:i+1]) + out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype)) + inference_response = pb_utils.InferenceResponse(output_tensors=[out0]) + responses.append(inference_response) + + return responses + + def finalize(self): + """`finalize` is called only once when the model is being unloaded. + Implementing `finalize` function is optional. This function allows + the model to perform any necessary clean ups before exit. + """ + print('Cleaning up...') diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/config.pbtxt b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/config.pbtxt new file mode 100644 index 000000000..6b43fe48b --- /dev/null +++ b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/config.pbtxt @@ -0,0 +1,67 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: "scoring" +backend: "python" +max_batch_size: 64 + +parameters [ + { + key: "ignore_id", + value: { string_value: "-1"} + }, + { + key: "vocabulary", + value: { string_value: "./model_repo_paraformer_large_offline/scoring/token_list.pkl"} + }, + { + key: "lm_path" + value: { string_value: "#lm_path"} + }, + { key: "FORCE_CPU_ONLY_INPUT_TENSORS" + value: {string_value:"no"} + } +] + + +input [ + { + name: "logits" + data_type: TYPE_FP32 + dims: [-1, 8404] + }, + { + name: "token_num" + data_type: TYPE_INT64 + dims: [1] + reshape: { shape: [ ] } + } +] + +output [ + { + name: "OUTPUT0" + data_type: TYPE_STRING + dims: [1] + } +] + +dynamic_batching { + } +instance_group [ + { + count: 2 + kind: KIND_CPU + } + ] diff --git a/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/token_list.pkl b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/token_list.pkl new file mode 100644 index 000000000..f1a2ce778 Binary files /dev/null and b/funasr/runtime/triton_gpu/model_repo_paraformer_large_offline/scoring/token_list.pkl differ