Add triton server for SenseVoice (#1901)

* add triton server for SenseVoice

* fix formatting
This commit is contained in:
Yuekai Zhang 2024-07-15 18:43:19 +08:00 committed by GitHub
parent f2ed4b3856
commit 584cfbdc43
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
19 changed files with 1071 additions and 51 deletions

View File

@ -0,0 +1,22 @@
FROM nvcr.io/nvidia/tritonserver:24.05-py3
# https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html
# Please choose a previous tritonserver:xx.xx tag if you encounter a CUDA driver mismatch issue.
LABEL maintainer="NVIDIA"
LABEL repository="tritonserver"

# --no-cache-dir keeps the pip download cache out of the image layers.
RUN pip install --no-cache-dir torch

# cmake is presumably required by the kaldifeat source build below; the apt
# package lists are removed in the same layer so they do not bloat the image.
RUN apt-get update && \
    apt-get -y install cmake && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace
RUN pip install --no-cache-dir -U "huggingface_hub[cli]" "tritonclient[all]" soundfile pyyaml torchaudio sentencepiece

# CUDA architectures kaldifeat is compiled for.
ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0"
RUN git clone https://github.com/csukuangfj/kaldifeat && \
    cd kaldifeat && \
    sed -i 's/in running_cuda_version//g' get_version.py && \
    python3 setup.py install

# Fetch the ready-made Triton model repository into /workspace.
RUN huggingface-cli download --local-dir ./model_repo_sense_voice_small yuekai/model_repo_sense_voice_small

# Drop the downloader's metadata directory from the model repository.
# huggingface_hub is installed unpinned, and its metadata folder name differs
# between releases, so both known locations are removed and -f is used so a
# missing directory does not fail the build.
# NOTE(review): presumably these are removed because Triton would otherwise
# try to load them as models - confirm.
RUN rm -rf ./model_repo_sense_voice_small/.huggingface ./model_repo_sense_voice_small/.cache

View File

@ -1,85 +1,81 @@
## Inference with Triton ## Triton Inference Serving Best Practice for SenseVoice
### Steps: ### Quick Start
1. Prepare model repo files Directly launch the service using docker compose.
```sh ```sh
git-lfs install docker compose up --build
git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
pretrained_model_dir=$(pwd)/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
cp $pretrained_model_dir/am.mvn ./model_repo_paraformer_large_offline/feature_extractor/
cp $pretrained_model_dir/config.yaml ./model_repo_paraformer_large_offline/feature_extractor/
# Refer here to get model.onnx (https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/export/README.md)
cp <exported_onnx_dir>/model.onnx ./model_repo_paraformer_large_offline/encoder/1/
``` ```
### Build Image
Build the docker image from scratch.
```sh
# build from scratch; run this from the directory that contains the Dockerfile/ folder
docker build . -f Dockerfile/Dockerfile.sensevoice -t soar97/triton-sensevoice:24.05
```
### Create Docker Container
```sh
your_mount_dir=/mnt:/mnt
docker run -it --name "sensevoice-server" --gpus all --net host -v $your_mount_dir --shm-size=2g soar97/triton-sensevoice:24.05
```
### Export SenseVoice Model to Onnx
Please follow the official FunASR guide to export the SenseVoice ONNX file. You also need to download the tokenizer file yourself.
### Launch Server
Log of directory tree: Log of directory tree:
```sh ```sh
model_repo_paraformer_large_offline/ model_repo_sense_voice_small
|-- encoder |-- encoder
| |-- 1 | |-- 1
| | `-- model.onnx | | `-- model.onnx -> /your/path/model.onnx
| `-- config.pbtxt | `-- config.pbtxt
|-- feature_extractor |-- feature_extractor
| |-- 1 | |-- 1
| | `-- model.py | | `-- model.py
| |-- config.pbtxt
| |-- am.mvn | |-- am.mvn
| |-- config.pbtxt
| `-- config.yaml | `-- config.yaml
|-- infer_pipeline |-- scoring
| |-- 1 | |-- 1
| | `-- model.py
| |-- chn_jpn_yue_eng_ko_spectok.bpe.model -> /your/path/chn_jpn_yue_eng_ko_spectok.bpe.model
| `-- config.pbtxt | `-- config.pbtxt
`-- scoring `-- sensevoice
|-- 1 |-- 1
| `-- model.py
`-- config.pbtxt `-- config.pbtxt
8 directories, 9 files 8 directories, 10 files
```
2. Follow below instructions to launch triton server
```sh
# using docker image Dockerfile/Dockerfile.server
docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01
docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_offline>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01
# launch the service # launch the service
tritonserver --model-repository /workspace/model_repo_paraformer_large_offline \ tritonserver --model-repository /workspace/model_repo_sense_voice_small \
--pinned-memory-pool-byte-size=512000000 \ --pinned-memory-pool-byte-size=512000000 \
--cuda-memory-pool-byte-size=0:1024000000 --cuda-memory-pool-byte-size=0:1024000000
``` ```
### Performance benchmark
Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
### Benchmark using Dataset
```sh ```sh
# For client container: git clone https://github.com/yuekaizhang/Triton-ASR-Client.git
docker run -it --rm --name "client_test" --net host --gpus all -v <path_host/triton_gpu/client>:/workpace/ soar97/triton-k2:22.12.1 # noqa cd Triton-ASR-Client
# For aishell manifests: num_task=32
apt-get install git-lfs python3 client.py \
git-lfs install --server-addr localhost \
git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests --server-port 10086 \
sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell --model-name sensevoice \
tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
serveraddr=localhost
manifest_path=/workspace/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz
num_task=60
python3 client/decode_manifest_triton.py \
--server-addr $serveraddr \
--compute-cer \ --compute-cer \
--model-name infer_pipeline \
--num-tasks $num_task \ --num-tasks $num_task \
--manifest-filename $manifest_path --batch-size 16 \
--manifest-dir ./datasets/aishell1_test
``` ```
(Note: The service has been fully warm up.) Benchmark results below were based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
|concurrent-tasks | processing time(s) | RTF | |concurrent-tasks | batch-size-per-task | processing time(s) | RTF |
|----------|--------------------|------------| |----------|--------------------|------------|---------------------|
| 60 (onnx fp32) | 116.0 | 0.0032| | 32 (onnx fp32) | 16 | 67.09 | 0.0019|
| 32 (onnx fp32) | 1 | 82.04 | 0.0023|
(Note: for batch-size-per-task=1 cases, tritonserver could use dynamic batching to improve throughput.)
## Acknowledge ## Acknowledge
This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us. This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us.

View File

@ -0,0 +1,85 @@
## Inference with Triton
### Steps:
1. Prepare model repo files
```sh
git-lfs install
git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
pretrained_model_dir=$(pwd)/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
cp $pretrained_model_dir/am.mvn ./model_repo_paraformer_large_offline/feature_extractor/
cp $pretrained_model_dir/config.yaml ./model_repo_paraformer_large_offline/feature_extractor/
# Refer here to get model.onnx (https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/export/README.md)
cp <exported_onnx_dir>/model.onnx ./model_repo_paraformer_large_offline/encoder/1/
```
Log of directory tree:
```sh
model_repo_paraformer_large_offline/
|-- encoder
| |-- 1
| | `-- model.onnx
| `-- config.pbtxt
|-- feature_extractor
| |-- 1
| | `-- model.py
| |-- config.pbtxt
| |-- am.mvn
| `-- config.yaml
|-- infer_pipeline
| |-- 1
| `-- config.pbtxt
`-- scoring
|-- 1
| `-- model.py
`-- config.pbtxt
8 directories, 9 files
```
2. Follow below instructions to launch triton server
```sh
# using docker image Dockerfile/Dockerfile.server
docker build . -f Dockerfile/Dockerfile.server -t triton-paraformer:23.01
docker run -it --rm --name "paraformer_triton_server" --gpus all -v <path_host/model_repo_paraformer_large_offline>:/workspace/ --shm-size 1g --net host triton-paraformer:23.01
# launch the service
tritonserver --model-repository /workspace/model_repo_paraformer_large_offline \
--pinned-memory-pool-byte-size=512000000 \
--cuda-memory-pool-byte-size=0:1024000000
```
### Performance benchmark
Benchmark [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on Aishell1 test set with a single V100, the total audio duration is 36108.919 seconds.
```sh
# For client container:
docker run -it --rm --name "client_test" --net host --gpus all -v <path_host/triton_gpu/client>:/workspace/ soar97/triton-k2:22.12.1 # noqa
# For aishell manifests:
apt-get install git-lfs
git-lfs install
git clone https://huggingface.co/csukuangfj/aishell-test-dev-manifests
sudo mkdir -p /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell
tar xf ./aishell-test-dev-manifests/data_aishell.tar.gz -C /root/fangjun/open-source/icefall-aishell/egs/aishell/ASR/download/aishell/ # noqa
serveraddr=localhost
manifest_path=/workspace/aishell-test-dev-manifests/data/fbank/aishell_cuts_test.jsonl.gz
num_task=60
python3 client/decode_manifest_triton.py \
--server-addr $serveraddr \
--compute-cer \
--model-name infer_pipeline \
--num-tasks $num_task \
--manifest-filename $manifest_path
```
(Note: The service had been fully warmed up before benchmarking.)
|concurrent-tasks | processing time(s) | RTF |
|----------|--------------------|------------|
| 60 (onnx fp32) | 116.0 | 0.0032|
## Acknowledge
This part originates from NVIDIA CISI project. We also have TTS and NLP solutions deployed on triton inference server. If you are interested, please contact us.

View File

@ -0,0 +1,18 @@
services:
asr:
image: soar97/triton-sensevoice:24.05
ports:
- "10085:8000"
- "10086:8001"
- "10087:8002"
environment:
- PYTHONIOENCODING=utf-8
deploy:
resources:
reservations:
devices:
- driver: nvidia
device_ids: ['0']
capabilities: [gpu]
command: >
/bin/bash -c "cd ./model_repo_sense_voice_small && bash run.sh"

View File

@ -51,6 +51,7 @@ dynamic_batching {
max_queue_delay_microseconds: 500 max_queue_delay_microseconds: 500
} }
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
instance_group [ instance_group [
{ {

View File

@ -69,6 +69,8 @@ output [
} }
] ]
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
instance_group [ instance_group [
{ {
count: 1 count: 1

View File

@ -0,0 +1 @@
/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoice/model.onnx

View File

@ -0,0 +1,71 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "encoder"
backend: "onnxruntime"
default_model_filename: "model.onnx"
max_batch_size: 16
input [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560]
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "language"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
},
{
name: "textnorm"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "ctc_logits"
data_type: TYPE_FP32
dims: [-1, 25055]
},
{
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
dynamic_batching {
}
parameters { key: "cudnn_conv_algo_search" value: { string_value: "2" } }
instance_group [
{
count: 1
kind: KIND_GPU
}
]

View File

@ -0,0 +1,325 @@
#!/usr/bin/env python3
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import triton_python_backend_utils as pb_utils
from torch.utils.dlpack import to_dlpack
import torch
import numpy as np
import kaldifeat
import _kaldifeat
from typing import List
import json
import yaml
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
class LFR(torch.nn.Module):
"""Batch LFR: https://github.com/Mddct/devil-asr/blob/main/patch/lfr.py"""
def __init__(self, m: int = 7, n: int = 6) -> None:
"""
Actually, this implements stacking frames and skipping frames.
if m = 1 and n = 1, just return the origin features.
if m = 1 and n > 1, it works like skipping.
if m > 1 and n = 1, it works like stacking but only support right frames.
if m > 1 and n > 1, it works like LFR.
"""
super().__init__()
self.m = m
self.n = n
self.left_padding_nums = math.ceil((self.m - 1) // 2)
def forward(
self, input_tensor: torch.Tensor, input_lens: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
B, _, D = input_tensor.size()
n_lfr = torch.ceil(input_lens / self.n)
prepad_nums = input_lens + self.left_padding_nums
right_padding_nums = torch.where(
self.m >= (prepad_nums - self.n * (n_lfr - 1)),
self.m - (prepad_nums - self.n * (n_lfr - 1)),
0,
)
T_all = self.left_padding_nums + input_lens + right_padding_nums
new_len = T_all // self.n
T_all_max = T_all.max().int()
tail_frames_index = (input_lens - 1).view(B, 1, 1).repeat(1, 1, D) # [B,1,D]
tail_frames = torch.gather(input_tensor, 1, tail_frames_index)
tail_frames = tail_frames.repeat(1, right_padding_nums.max().int(), 1)
head_frames = input_tensor[:, 0:1, :].repeat(1, self.left_padding_nums, 1)
# stack
input_tensor = torch.cat([head_frames, input_tensor, tail_frames], dim=1)
index = (
torch.arange(T_all_max, device=input_tensor.device, dtype=input_lens.dtype)
.unsqueeze(0)
.repeat(B, 1)
) # [B, T_all_max]
index_mask = index < (self.left_padding_nums + input_lens).unsqueeze(1) # [B, T_all_max]
tail_index_mask = torch.logical_not(index >= (T_all.unsqueeze(1))) & index_mask
tail = torch.ones(T_all_max, dtype=input_lens.dtype, device=input_tensor.device).unsqueeze(
0
).repeat(B, 1) * (
T_all_max - 1
) # [B, T_all_max]
indices = torch.where(torch.logical_or(index_mask, tail_index_mask), index, tail)
input_tensor = torch.gather(input_tensor, 1, indices.unsqueeze(2).repeat(1, 1, D))
input_tensor = input_tensor.unfold(1, self.m, step=self.n).transpose(2, 3)
return input_tensor.reshape(B, -1, D * self.m), new_len
class WavFrontend:
    """Conventional frontend structure for ASR.

    Holds the front-end hyper-parameters, an ``LFR`` frame-stacking module and,
    when ``cmvn_file`` is given, the shift/scale statistics parsed from it.
    """

    def __init__(
        self,
        cmvn_file: str = None,
        fs: int = 16000,
        window: str = "hamming",
        n_mels: int = 80,
        frame_length: int = 25,
        frame_shift: int = 10,
        filter_length_min: int = -1,
        filter_length_max: float = -1,
        lfr_m: int = 7,
        lfr_n: int = 6,
        dither: float = 1.0,
    ) -> None:
        """Store the configuration and load the CMVN statistics if available.

        Args:
            cmvn_file: path of the CMVN statistics file; when falsy no
                statistics are loaded and ``self.cmvn`` is ``None``.
            fs, window, n_mels, frame_length, frame_shift, filter_length_min,
            filter_length_max, dither: stored on the instance as given.
            lfr_m, lfr_n: stored and also used to build ``self.lfr``.
        """
        self.fs = fs
        self.window = window
        self.n_mels = n_mels
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.filter_length_min = filter_length_min
        self.filter_length_max = filter_length_max
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.lfr = LFR(lfr_m, lfr_n)
        self.cmvn_file = cmvn_file
        self.dither = dither
        # Always define the attribute so a missing file yields a clear error
        # in apply_cmvn_batch instead of an AttributeError.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None

    def apply_cmvn_batch(self, inputs: torch.Tensor) -> torch.Tensor:
        """Apply CMVN to a batch of features: ``(inputs + shift) * scale``.

        Args:
            inputs: feature tensor whose last dimension is the feature
                dimension; only the first ``dim`` statistics are used.

        Returns:
            The normalised tensor. The statistics are float64, so the result
            follows torch's type promotion rules.

        Raises:
            RuntimeError: if no CMVN statistics have been loaded.
        """
        cmvn = getattr(self, "cmvn", None)
        if cmvn is None:
            raise RuntimeError(
                "CMVN statistics are not loaded; construct WavFrontend with a cmvn_file"
            )
        dim = inputs.shape[-1]
        # Shape [1, dim] broadcasts over the batch and frame axes, so the
        # statistics do not need to be tiled per frame.
        shift = torch.from_numpy(cmvn[0:1, :dim]).to(inputs.device)
        scale = torch.from_numpy(cmvn[1:2, :dim]).to(inputs.device)
        return (inputs + shift) * scale

    def load_cmvn(
        self,
    ) -> np.ndarray:
        """Parse ``self.cmvn_file`` and return the statistics as a ``[2, dim]`` array.

        Row 0 holds the values following an ``<AddShift>`` marker and row 1 the
        values following a ``<Rescale>`` marker. In both cases the values are
        read from the next line, which must start with ``<LearnRateCoef>``; the
        first three fields and the final field of that line are skipped.

        Raises:
            ValueError: if either set of statistics is missing from the file.
        """
        with open(self.cmvn_file, "r", encoding="utf-8") as f:
            rows = [line.split() for line in f]

        means_list = []
        scales_list = []
        for i, fields in enumerate(rows):
            # Blank lines and a marker on the very last line carry no data.
            if not fields or i + 1 >= len(rows):
                continue
            following = rows[i + 1]
            if not following or following[0] != "<LearnRateCoef>":
                continue
            if fields[0] == "<AddShift>":
                means_list = following[3:-1]
            elif fields[0] == "<Rescale>":
                scales_list = following[3:-1]

        if not means_list or not scales_list:
            raise ValueError(
                f"could not find <AddShift>/<Rescale> statistics in {self.cmvn_file}"
            )

        means = np.array(means_list).astype(np.float64)
        scales = np.array(scales_list).astype(np.float64)
        return np.array([means, scales])
class Fbank(torch.nn.Module):
    """``torch.nn.Module`` wrapper that delegates to ``kaldifeat.Fbank``."""

    def __init__(self, opts):
        """Create the wrapped ``kaldifeat.Fbank`` extractor from ``opts``."""
        super().__init__()
        extractor = kaldifeat.Fbank(opts)
        self.fbank = extractor

    def forward(self, waves: List[torch.Tensor]):
        """Pass ``waves`` to the wrapped extractor and return its result unchanged."""
        features = self.fbank(waves)
        return features
class TritonPythonModel:
    """Triton Python-backend model that converts waveforms into encoder features.

    Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Reads the output dtypes from the model configuration, loads the
        front-end YAML config named by the ``config_path`` parameter, injects
        the ``cmvn_path`` parameter into it and builds the fbank extractor and
        the ``WavFrontend``.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name

        Raises
        ------
        ValueError
          If the ``config_path`` or ``cmvn_path`` parameter is missing.
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)
        # NOTE(review): the device is fixed to the default CUDA device;
        # args["model_instance_device_id"] is not consulted.
        self.device = "cuda"

        # Output "speech": anything other than FP32 is produced as float16.
        output0_config = pb_utils.get_output_config_by_name(model_config, "speech")
        output0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])
        if output0_dtype == np.float32:
            self.output0_dtype = torch.float32
        else:
            self.output0_dtype = torch.float16

        # Output "speech_lengths": keep the numpy dtype from the configuration.
        output1_config = pb_utils.get_output_config_by_name(model_config, "speech_lengths")
        self.output1_dtype = pb_utils.triton_string_to_numpy(output1_config["data_type"])

        config = None
        cmvn_path = None
        for key, value in self.model_config["parameters"].items():
            value = value["string_value"]
            if key == "config_path":
                # NOTE(review): yaml.Loader can construct arbitrary Python
                # objects; config_path must point to a trusted file
                # (yaml.safe_load would be the safer choice).
                with open(str(value), "rb") as f:
                    config = yaml.load(f, Loader=yaml.Loader)
            elif key == "cmvn_path":
                cmvn_path = str(value)
        if config is None or cmvn_path is None:
            raise ValueError(
                "feature_extractor requires both 'config_path' and 'cmvn_path' parameters"
            )
        config["frontend_conf"]["cmvn_file"] = cmvn_path

        opts = kaldifeat.FbankOptions()
        opts.frame_opts.dither = 1.0  # TODO: 0.0 or 1.0
        opts.frame_opts.window_type = config["frontend_conf"]["window"]
        opts.mel_opts.num_bins = int(config["frontend_conf"]["n_mels"])
        opts.frame_opts.frame_shift_ms = float(config["frontend_conf"]["frame_shift"])
        opts.frame_opts.frame_length_ms = float(config["frontend_conf"]["frame_length"])
        opts.frame_opts.samp_freq = int(config["frontend_conf"]["fs"])
        opts.device = torch.device(self.device)
        self.opts = opts

        self.feature_extractor = Fbank(self.opts)
        self.feature_size = opts.mel_opts.num_bins
        self.frontend = WavFrontend(**config["frontend_conf"])

    def extract_feat(self, waveform_list: List[np.ndarray]) -> Tuple[torch.Tensor, torch.Tensor]:
        """Compute padded, LFR-stacked and CMVN-normalised features.

        Args:
            waveform_list: one numpy array per utterance; each is converted to
                float, squeezed and moved to ``self.device``.

        Returns:
            ``(feats, feats_len)``: ``feats`` cast to ``self.output0_dtype`` and
            ``feats_len`` cast to int32, both as torch tensors on ``self.device``.
        """
        wavs = [
            torch.from_numpy(waveform).float().squeeze().to(self.device)
            for waveform in waveform_list
        ]

        features = self.feature_extractor(wavs)
        features_len = [feature.shape[0] for feature in features]

        # Zero-pad every utterance to the longest one in the batch.
        speech = torch.zeros(
            (len(features), max(features_len), self.opts.mel_opts.num_bins),
            dtype=self.output0_dtype,
            device=self.device,
        )
        for i, feature in enumerate(features):
            speech[i, : int(features_len[i])] = feature
        speech_lens = torch.tensor(features_len, dtype=torch.int64).to(self.device)

        feats, feats_len = self.frontend.lfr(speech, speech_lens)
        feats_len = feats_len.type(torch.int32)

        feats = self.frontend.apply_cmvn_batch(feats)
        feats = feats.type(self.output0_dtype)

        return feats, feats_len

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        All waveforms of all requests are featurised in a single batch and the
        result is split back per request.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        if not requests:
            return []

        batch_count = []
        total_waves = []
        for request in requests:
            input0 = pb_utils.get_input_tensor_by_name(request, "wav")
            # Scale by 2**15 - presumably to bring [-1, 1] float audio to the
            # 16-bit integer range; confirm against the client.
            cur_b_wav = input0.as_numpy() * (1 << 15)  # b x -1
            # The "wav_lens" input is intentionally not used: padding is kept,
            # since removing it may prevent the encoder from batching requests
            # of different lengths.
            batch_count.append(cur_b_wav.shape[0])

            # convert the bx-1 numpy array into a 1x-1 list of arrays
            total_waves.extend(
                np.expand_dims(cur_b_wav[i], 0) for i in range(cur_b_wav.shape[0])
            )

        features, feats_len = self.extract_feat(total_waves)

        responses = []
        i = 0
        for batch in batch_count:
            speech = features[i : i + batch]
            speech_lengths = feats_len[i : i + batch].unsqueeze(1)

            # Outputs are handed back to Triton as CPU tensors.
            speech, speech_lengths = speech.cpu(), speech_lengths.cpu()
            out0 = pb_utils.Tensor.from_dlpack("speech", to_dlpack(speech))
            out1 = pb_utils.Tensor.from_dlpack("speech_lengths", to_dlpack(speech_lengths))
            inference_response = pb_utils.InferenceResponse(output_tensors=[out0, out1])
            responses.append(inference_response)
            i += batch

        return responses

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,81 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "feature_extractor"
backend: "python"
max_batch_size: 16
parameters [
{
key: "num_mel_bins",
value: { string_value: "80"}
},
{
key: "frame_shift_in_ms"
value: { string_value: "10"}
},
{
key: "frame_length_in_ms"
value: { string_value: "25"}
},
{
key: "sample_rate"
value: { string_value: "16000"}
},
{
key: "cmvn_path"
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/am.mvn"}
},
{
key: "config_path"
value: { string_value: "./model_repo_sense_voice_small/feature_extractor/config.yaml"}
}
]
input [
{
name: "wav"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "wav_lens"
data_type: TYPE_INT32
dims: [1]
}
]
output [
{
name: "speech"
data_type: TYPE_FP32
dims: [-1, 560] # 560 = n_mels (80) x lfr_m (7) stacked frames
},
{
name: "speech_lengths"
data_type: TYPE_INT32
dims: [1]
}
]
dynamic_batching {
}
instance_group [
{
count: 2
kind: KIND_GPU
}
]

View File

@ -0,0 +1,97 @@
encoder: SenseVoiceEncoderSmall
encoder_conf:
output_size: 512
attention_heads: 4
linear_units: 2048
num_blocks: 50
tp_blocks: 20
dropout_rate: 0.1
positional_dropout_rate: 0.1
attention_dropout_rate: 0.1
input_layer: pe
pos_enc_class: SinusoidalPositionEncoder
normalize_before: true
kernel_size: 11
sanm_shfit: 0
selfattention_layer_type: sanm
model: SenseVoiceSmall
model_conf:
length_normalized_loss: true
sos: 1
eos: 2
ignore_id: -1
tokenizer: SentencepiecesTokenizer
tokenizer_conf:
bpemodel: null
unk_symbol: <unk>
split_with_space: true
frontend: WavFrontend
frontend_conf:
fs: 16000
window: hamming
n_mels: 80
frame_length: 25
frame_shift: 10
lfr_m: 7
lfr_n: 6
cmvn_file: null
dataset: SenseVoiceCTCDataset
dataset_conf:
index_ds: IndexDSJsonl
batch_sampler: EspnetStyleBatchSampler
data_split_num: 32
batch_type: token
batch_size: 14000
max_token_length: 2000
min_token_length: 60
max_source_length: 2000
min_source_length: 60
max_target_length: 200
min_target_length: 0
shuffle: true
num_workers: 4
sos: ${model_conf.sos}
eos: ${model_conf.eos}
IndexDSJsonl: IndexDSJsonl
retry: 20
train_conf:
accum_grad: 1
grad_clip: 5
max_epoch: 20
keep_nbest_models: 10
avg_nbest_model: 10
log_interval: 100
resume: true
validate_interval: 10000
save_checkpoint_interval: 10000
optim: adamw
optim_conf:
lr: 0.00002
scheduler: warmuplr
scheduler_conf:
warmup_steps: 25000
specaug: SpecAugLFR
specaug_conf:
apply_time_warp: false
time_warp_window: 5
time_warp_mode: bicubic
apply_freq_mask: true
freq_mask_width_range:
- 0
- 30
lfr_rate: 6
num_freq_mask: 1
apply_time_mask: true
time_mask_width_range:
- 0
- 12
num_time_mask: 1

View File

@ -0,0 +1,136 @@
#!/usr/bin/env python3
#
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import triton_python_backend_utils as pb_utils
import numpy as np
import torch
from torch.utils.dlpack import from_dlpack
import json
import os
import yaml
import sentencepiece as spm
class TritonPythonModel:
    """Triton Python-backend model that greedily decodes CTC logits into text.

    Your Python model must use the same class name. Every Python model
    that is created must have "TritonPythonModel" as the class name.
    """

    def initialize(self, args):
        """`initialize` is called only once when the model is being loaded.

        Reads the dtype of ``OUTPUT0`` from the model configuration and loads
        the SentencePiece tokenizer named by the ``tokenizer_path`` parameter.

        Parameters
        ----------
        args : dict
          Both keys and values are strings. The dictionary keys and values are:
          * model_config: A JSON string containing the model configuration
          * model_instance_kind: A string containing model instance kind
          * model_instance_device_id: A string containing model instance device ID
          * model_repository: Model repository path
          * model_version: Model version
          * model_name: Model name
        """
        self.model_config = model_config = json.loads(args["model_config"])
        self.max_batch_size = max(model_config["max_batch_size"], 1)

        # Get OUTPUT0 configuration and convert its Triton type to a numpy type.
        output0_config = pb_utils.get_output_config_by_name(model_config, "OUTPUT0")
        self.out0_dtype = pb_utils.triton_string_to_numpy(output0_config["data_type"])

        self.init_tokenizer(self.model_config["parameters"])

    def init_tokenizer(self, parameters):
        """Load the SentencePiece model referenced by ``tokenizer_path``.

        Args:
            parameters: the ``parameters`` mapping of the model configuration;
                each value is a dict with a ``string_value`` entry.

        Raises:
            ValueError: if no ``tokenizer_path`` parameter is configured.
        """
        tokenizer_path = None
        for key, value in parameters.items():
            if key == "tokenizer_path":
                tokenizer_path = value["string_value"]
        if tokenizer_path is None:
            raise ValueError("scoring requires a 'tokenizer_path' parameter")
        self.tokenizer = spm.SentencePieceProcessor()
        self.tokenizer.Load(tokenizer_path)

    @staticmethod
    def _greedy_ctc_token_ids(logits_batch, drop_ids=(0, 2)):
        """Greedy CTC decoding of a ``[batch, time, vocab]`` logits tensor.

        For every sequence the arg-max token of each frame is taken,
        consecutive repeats are merged *within that sequence*, and the ids in
        ``drop_ids`` (blank id 0 and EOS id 2 by default) are removed.

        Returns:
            A list with one list of token ids per sequence.
        """
        # One device-to-host transfer for the whole batch.
        best_path = logits_batch.argmax(dim=-1).cpu()
        decoded = []
        for row in best_path:
            # Collapse per row: unique_consecutive over the last dim of the 2-D
            # batch would only drop frames that repeat in *every* sequence.
            collapsed = torch.unique_consecutive(row).tolist()
            decoded.append([tok for tok in collapsed if tok not in drop_ids])
        return decoded

    def execute(self, requests):
        """`execute` must be implemented in every Python model. `execute`
        function receives a list of pb_utils.InferenceRequest as the only
        argument. This function is called when an inference is requested
        for this model.

        Parameters
        ----------
        requests : list
          A list of pb_utils.InferenceRequest

        Returns
        -------
        list
          A list of pb_utils.InferenceResponse. The length of this list must
          be the same as `requests`
        """
        # Every Python backend must iterate through list of requests and create
        # an instance of pb_utils.InferenceResponse class for each of them. You
        # should avoid storing any of the input Tensors in the class attributes
        # as they will be overridden in subsequent inference requests.
        #
        # Requests are decoded one by one so that requests with different
        # numbers of frames never have to be concatenated.
        # NOTE(review): only "ctc_logits" is read; every frame of the padded
        # tensor is decoded - confirm that padded frames decode to blank.
        responses = []
        for request in requests:
            in_0 = pb_utils.get_input_tensor_by_name(request, "ctc_logits")
            logits = from_dlpack(in_0.to_dlpack())

            hyps = [
                self.tokenizer.DecodeIds(token_ids)
                for token_ids in self._greedy_ctc_token_ids(logits)
            ]

            sents = np.array(hyps)
            out0 = pb_utils.Tensor("OUTPUT0", sents.astype(self.out0_dtype))
            responses.append(pb_utils.InferenceResponse(output_tensors=[out0]))
        return responses

    def finalize(self):
        """`finalize` is called only once when the model is being unloaded.
        Implementing `finalize` function is optional. This function allows
        the model to perform any necessary clean ups before exit.
        """
        print("Cleaning up...")

View File

@ -0,0 +1 @@
/mnt/samsung-t7/yuekai/asr/funaudiollm/SenseVoiceSmall/chn_jpn_yue_eng_ko_spectok.bpe.model

View File

@ -0,0 +1,59 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "scoring"
backend: "python"
max_batch_size: 16
parameters [
{
key: "tokenizer_path",
value: { string_value: "./model_repo_sense_voice_small/scoring/chn_jpn_yue_eng_ko_spectok.bpe.model"}
},
{ key: "FORCE_CPU_ONLY_INPUT_TENSORS"
value: {string_value:"no"}
}
]
input [
{
name: "ctc_logits"
data_type: TYPE_FP32
dims: [-1, 25055]
},
{
name: "encoder_out_lens"
data_type: TYPE_INT32
dims: [1]
reshape: { shape: [ ] }
}
]
output [
{
name: "OUTPUT0"
data_type: TYPE_STRING
dims: [1]
}
]
dynamic_batching {
}
instance_group [
{
count: 2
kind: KIND_CPU
}
]

View File

@ -0,0 +1,117 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
name: "sensevoice"
platform: "ensemble"
max_batch_size: 16
input [
{
name: "WAV"
data_type: TYPE_FP32
dims: [-1]
},
{
name: "WAV_LENS"
data_type: TYPE_INT32
dims: [1]
},
{
name: "LANGUAGE"
data_type: TYPE_INT32
dims: [1]
},
{
name: "TEXT_NORM"
data_type: TYPE_INT32
dims: [1]
}
]
output [
{
name: "TRANSCRIPTS"
data_type: TYPE_STRING
dims: [1]
}
]
ensemble_scheduling {
step [
{
model_name: "feature_extractor"
model_version: -1
input_map {
key: "wav"
value: "WAV"
}
input_map {
key: "wav_lens"
value: "WAV_LENS"
}
output_map {
key: "speech"
value: "SPEECH"
}
output_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
},
{
model_name: "encoder"
model_version: -1
input_map {
key: "speech"
value: "SPEECH"
}
input_map {
key: "speech_lengths"
value: "SPEECH_LENGTHS"
}
input_map {
key: "language"
value: "LANGUAGE"
}
input_map {
key: "textnorm"
value: "TEXT_NORM"
}
output_map {
key: "ctc_logits"
value: "ctc_logits"
}
output_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
},
{
model_name: "scoring"
model_version: -1
input_map {
key: "ctc_logits"
value: "ctc_logits"
}
input_map {
key: "encoder_out_lens"
value: "encoder_out_lens"
}
output_map {
key: "OUTPUT0"
value: "TRANSCRIPTS"
}
}
]
}