diff --git a/funasr/runtime/python/onnxruntime/.gitignore b/funasr/runtime/python/onnxruntime/.gitignore new file mode 100644 index 000000000..c97db36d7 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/.gitignore @@ -0,0 +1,3 @@ +**/__pycache__ +*.onnx +*.pyc \ No newline at end of file diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md new file mode 100644 index 000000000..d0fedb9aa --- /dev/null +++ b/funasr/runtime/python/onnxruntime/README.md @@ -0,0 +1,61 @@ +## Using paraformer with ONNXRuntime + +
+
+### Introduction
+- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
+
+
+### Steps:
+1. Download the whole directory (`funasr/runtime/python/onnxruntime`) to your local machine.
+2. Install the related packages.
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Download the model.
+   - [Download Link](https://swap.oss-cn-hangzhou.aliyuncs.com/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.onnx?OSSAccessKeyId=LTAI4FxMqzhBUx5XD4mKs296&Expires=2036094510&Signature=agmtMkxLEviGg3Rt3gOO4PvfrJY%3D)
+   - Put the model into the `resources/models` directory.
+   ```text
+   .
+   ├── demo.py
+   ├── rapid_paraformer
+   │   ├── __init__.py
+   │   ├── kaldifeat
+   │   ├── __pycache__
+   │   ├── rapid_paraformer.py
+   │   └── utils.py
+   ├── README.md
+   ├── requirements.txt
+   ├── resources
+   │   ├── config.yaml
+   │   └── models
+   │       ├── am.mvn
+   │       ├── model.onnx  # Put it here.
+   │       └── token_list.pkl
+   ├── test_onnx.py
+   ├── tests
+   │   ├── __pycache__
+   │   └── test_infer.py
+   └── test_wavs
+       ├── 0478_00017.wav
+       └── asr_example_zh.wav
+   ```
+4. Run the demo.
+   - Input: wav format file; supported input types: `str, np.ndarray, List[str]`
+   - Output: `List[str]`: recognition result. 
+ - Example: + ```python + from rapid_paraformer import RapidParaformer + + + config_path = 'resources/config.yaml' + paraformer = RapidParaformer(config_path) + + wav_path = ['test_wavs/0478_00017.wav'] + + result = paraformer(wav_path) + print(result) + ``` \ No newline at end of file diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/__init__.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/__init__.py new file mode 100644 index 000000000..2640f5f5e --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/__init__.py @@ -0,0 +1,4 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +from .rapid_paraformer import RapidParaformer diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/LICENSE b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/LICENSE new file mode 100644 index 000000000..261eeb9e9 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/__init__.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/__init__.py new file mode 100644 index 000000000..f9cf27344 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/__init__.py @@ -0,0 +1,3 @@ +# -*- encoding: utf-8 -*- +from .feature import compute_fbank_feats, compute_mfcc_feats, apply_cmvn_sliding +from .ivector import compute_vad diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/feature.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/feature.py new file mode 100644 index 000000000..fb5f9a552 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/feature.py @@ -0,0 +1,459 @@ +import numpy as np +from scipy.fftpack import dct + + +# ---------- feature-window ---------- + +def sliding_window(x, window_size, window_shift): + shape = x.shape[:-1] + (x.shape[-1] - window_size + 1, window_size) + strides = x.strides + (x.strides[-1],) + return np.lib.stride_tricks.as_strided(x, shape=shape, strides=strides)[::window_shift] + + +def func_num_frames(num_samples, window_size, window_shift, snip_edges): + if snip_edges: + if num_samples < window_size: + return 0 + else: + return 1 + ((num_samples - window_size) // window_shift) + else: + return (num_samples + (window_shift // 2)) // window_shift + + +def 
func_dither(waveform, dither_value): + if dither_value == 0.0: + return waveform + waveform += np.random.normal(size=waveform.shape).astype(waveform.dtype) * dither_value + return waveform + + +def func_remove_dc_offset(waveform): + return waveform - np.mean(waveform) + + +def func_log_energy(waveform): + return np.log(np.dot(waveform, waveform).clip(min=np.finfo(waveform.dtype).eps)) + + +def func_preemphasis(waveform, preemph_coeff): + if preemph_coeff == 0.0: + return waveform + assert 0 < preemph_coeff <= 1 + waveform[1:] -= preemph_coeff * waveform[:-1] + waveform[0] -= preemph_coeff * waveform[0] + return waveform + + +def sine(M): + if M < 1: + return np.array([]) + if M == 1: + return np.ones(1, float) + n = np.arange(0, M) + return np.sin(np.pi*n/(M-1)) + + +def povey(M): + if M < 1: + return np.array([]) + if M == 1: + return np.ones(1, float) + n = np.arange(0, M) + return (0.5 - 0.5*np.cos(2.0*np.pi*n/(M-1)))**0.85 + + +def feature_window_function(window_type, window_size, blackman_coeff): + assert window_size > 0 + if window_type == 'hanning': + return np.hanning(window_size) + elif window_type == 'sine': + return sine(window_size) + elif window_type == 'hamming': + return np.hamming(window_size) + elif window_type == 'povey': + return povey(window_size) + elif window_type == 'rectangular': + return np.ones(window_size) + elif window_type == 'blackman': + window_func = np.blackman(window_size) + if blackman_coeff == 0.42: + return window_func + else: + return window_func - 0.42 + blackman_coeff + else: + raise ValueError('Invalid window type {}'.format(window_type)) + + +def process_window(window, dither, remove_dc_offset, preemphasis_coefficient, window_function, raw_energy): + if dither != 0.0: + window = func_dither(window, dither) + if remove_dc_offset: + window = func_remove_dc_offset(window) + if raw_energy: + log_energy = func_log_energy(window) + if preemphasis_coefficient != 0.0: + window = func_preemphasis(window, preemphasis_coefficient) + 
window *= window_function + if not raw_energy: + log_energy = func_log_energy(window) + return window, log_energy + + +def extract_window(waveform, blackman_coeff, dither, window_size, window_shift, + preemphasis_coefficient, raw_energy, remove_dc_offset, + snip_edges, window_type, dtype): + num_samples = len(waveform) + num_frames = func_num_frames(num_samples, window_size, window_shift, snip_edges) + num_samples_ = (num_frames - 1) * window_shift + window_size + if snip_edges: + waveform = waveform[:num_samples_] + else: + offset = window_shift // 2 - window_size // 2 + waveform = np.concatenate([ + waveform[-offset - 1::-1], + waveform, + waveform[:-(offset + num_samples_ - num_samples + 1):-1] + ]) + frames = sliding_window(waveform, window_size=window_size, window_shift=window_shift) + frames = frames.astype(dtype) + log_enery = np.empty(frames.shape[0], dtype=dtype) + for i in range(frames.shape[0]): + frames[i], log_enery[i] = process_window( + window=frames[i], + dither=dither, + remove_dc_offset=remove_dc_offset, + preemphasis_coefficient=preemphasis_coefficient, + window_function=feature_window_function( + window_type=window_type, + window_size=window_size, + blackman_coeff=blackman_coeff + ).astype(dtype), + raw_energy=raw_energy + ) + return frames, log_enery + +# ---------- feature-window ---------- + + +# ---------- feature-functions ---------- + +def compute_spectrum(frames, n): + complex_spec = np.fft.rfft(frames, n) + return np.absolute(complex_spec) + + +def compute_power_spectrum(frames, n): + return np.square(compute_spectrum(frames, n)) + + +def apply_cmvn_sliding_internal(feat, center=False, window=600, min_window=100, norm_vars=False): + num_frames, feat_dim = feat.shape + std = 1 + if center: + if num_frames <= window: + mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) + if norm_vars: + std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) + else: + feat1 = feat[:window] + feat2 = sliding_window(feat.T, window, 
1) + feat3 = feat[-window:] + mean1 = feat1.mean(axis=0, keepdims=True).repeat(window // 2, axis=0) + mean2 = feat2.mean(axis=2).T + mean3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) + mean = np.concatenate([mean1, mean2, mean3]) + if norm_vars: + std1 = feat1.std(axis=0, keepdims=True).repeat(window // 2, axis=0) + std2 = feat2.std(axis=2).T + std3 = feat3.mean(axis=0, keepdims=True).repeat((window - 1) // 2, axis=0) + std = np.concatenate([std1, std2, std3]) + else: + if num_frames <= min_window: + mean = feat.mean(axis=0, keepdims=True).repeat(num_frames, axis=0) + if norm_vars: + std = feat.std(axis=0, keepdims=True).repeat(num_frames, axis=0) + else: + feat1 = feat[:min_window] + mean1 = feat1.mean(axis=0, keepdims=True).repeat(min_window, axis=0) + feat2_cumsum = np.cumsum(feat[:window], axis=0)[min_window:] + cumcnt = np.arange(min_window + 1, min(window, num_frames) + 1, dtype=feat.dtype)[:, np.newaxis] + mean2 = feat2_cumsum / cumcnt + mean = np.concatenate([mean1, mean2]) + if norm_vars: + std1 = feat1.std(axis=0, keepdims=True).repeat(min_window, axis=0) + feat2_power_cumsum = np.cumsum(np.square(feat[:window]), axis=0)[min_window:] + std2 = np.sqrt(feat2_power_cumsum / cumcnt - np.square(mean2)) + std = np.concatenate([std1, std2]) + if num_frames > window: + feat3 = sliding_window(feat.T, window, 1) + mean3 = feat3.mean(axis=2).T + mean = np.concatenate([mean, mean3[1:]]) + if norm_vars: + std3 = feat3.std(axis=2).T + std = np.concatenate([std, std3[1:]]) + feat = (feat - mean) / std + return feat + +# ---------- feature-functions ---------- + + +# ---------- mel-computations ---------- + +def inverse_mel_scale(mel_freq): + return 700.0 * (np.exp(mel_freq / 1127.0) - 1.0) + + +def mel_scale(freq): + return 1127.0 * np.log(1.0 + freq / 700.0) + + +def compute_mel_banks(num_bins, sample_frequency, low_freq, high_freq, n): + """ Compute Mel banks. 
+ + :param num_bins: Number of triangular mel-frequency bins + :param sample_frequency: Waveform data sample frequency + :param low_freq: Low cutoff frequency for mel bins + :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) + :param n: Window size + :return: Mel banks. + """ + assert num_bins >= 3, 'Must have at least 3 mel bins' + num_fft_bins = n // 2 + + nyquist = 0.5 * sample_frequency + if high_freq <= 0: + high_freq = nyquist + high_freq + assert 0 <= low_freq < high_freq <= nyquist + + fft_bin_width = sample_frequency / n + + mel_low_freq = mel_scale(low_freq) + mel_high_freq = mel_scale(high_freq) + mel_freq_delta = (mel_high_freq - mel_low_freq) / (num_bins + 1) + + mel_banks = np.zeros([num_bins, num_fft_bins + 1]) + for i in range(num_bins): + left_mel = mel_low_freq + mel_freq_delta * i + center_mel = left_mel + mel_freq_delta + right_mel = center_mel + mel_freq_delta + for j in range(num_fft_bins): + mel = mel_scale(fft_bin_width * j) + if left_mel < mel < right_mel: + if mel <= center_mel: + mel_banks[i, j] = (mel - left_mel) / (center_mel - left_mel) + else: + mel_banks[i, j] = (right_mel - mel) / (right_mel - center_mel) + return mel_banks + + +def compute_lifter_coeffs(q, M): + """ Compute liftering coefficients (scaling on cepstral coeffs) + the zeroth index is C0, which is not affected. + + :param q: Number of lifters + :param M: Number of coefficients + :return: Lifters. 
+ """ + if M < 1: + return np.array([]) + if M == 1: + return np.ones(1, float) + n = np.arange(0, M) + return 1 + 0.5*np.sin(np.pi*n/q)*q + +# ---------- mel-computations ---------- + + +# ---------- compute-fbank-feats ---------- + +def compute_fbank_feats( + waveform, + blackman_coeff=0.42, + dither=1.0, + energy_floor=0.0, + frame_length=25, + frame_shift=10, + high_freq=0, + low_freq=20, + num_mel_bins=23, + preemphasis_coefficient=0.97, + raw_energy=True, + remove_dc_offset=True, + round_to_power_of_two=True, + sample_frequency=16000, + snip_edges=True, + use_energy=False, + use_log_fbank=True, + use_power=True, + window_type='povey', + dtype=np.float32): + """ Compute (log) Mel filter bank energies + + :param waveform: Input waveform. + :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42) + :param dither: Dithering constant (0.0 means no dither). If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) + :param energy_floor: Floor on energy (absolute, not relative) in FBANK computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. 
Suggested values: 0.1 or 1.0 (float, default = 0)
+    :param frame_length: Frame length in milliseconds (float, default = 25)
+    :param frame_shift: Frame shift in milliseconds (float, default = 10)
+    :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0)
+    :param low_freq: Low cutoff frequency for mel bins (float, default = 20)
+    :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23)
+    :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97)
+    :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true)
+    :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true)
+    :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true)
+    :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000)
+    :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. (bool, default = true)
+    :param use_energy: Add an extra energy output. (bool, default = false)
+    :param use_log_fbank: If true, produce log-filterbank, else produce linear. (bool, default = true)
+    :param use_power: If true, use power, else use magnitude. (bool, default = true)
+    :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackman") (string, default = "povey")
+    :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32)
+    :return: (Log) Mel filter bank energies. 
+ """ + window_size = int(frame_length * sample_frequency * 0.001) + window_shift = int(frame_shift * sample_frequency * 0.001) + frames, log_energy = extract_window( + waveform=waveform, + blackman_coeff=blackman_coeff, + dither=dither, + window_size=window_size, + window_shift=window_shift, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + snip_edges=snip_edges, + window_type=window_type, + dtype=dtype + ) + if round_to_power_of_two: + n = 1 + while n < window_size: + n *= 2 + else: + n = window_size + if use_power: + spectrum = compute_power_spectrum(frames, n) + else: + spectrum = compute_spectrum(frames, n) + mel_banks = compute_mel_banks( + num_bins=num_mel_bins, + sample_frequency=sample_frequency, + low_freq=low_freq, + high_freq=high_freq, + n=n + ).astype(dtype) + feat = np.dot(spectrum, mel_banks.T) + if use_log_fbank: + feat = np.log(feat.clip(min=np.finfo(dtype).eps)) + if use_energy: + if energy_floor > 0.0: + log_energy.clip(min=np.math.log(energy_floor)) + return feat, log_energy + return feat + +# ---------- compute-fbank-feats ---------- + + +# ---------- compute-mfcc-feats ---------- + +def compute_mfcc_feats( + waveform, + blackman_coeff=0.42, + cepstral_lifter=22, + dither=1.0, + energy_floor=0.0, + frame_length=25, + frame_shift=10, + high_freq=0, + low_freq=20, + num_ceps=13, + num_mel_bins=23, + preemphasis_coefficient=0.97, + raw_energy=True, + remove_dc_offset=True, + round_to_power_of_two=True, + sample_frequency=16000, + snip_edges=True, + use_energy=True, + window_type='povey', + dtype=np.float32): + """ Compute mel-frequency cepstral coefficients + + :param waveform: Input waveform. + :param blackman_coeff: Constant coefficient for generalized Blackman window. (float, default = 0.42) + :param cepstral_lifter: Constant that controls scaling of MFCCs (float, default = 22) + :param dither: Dithering constant (0.0 means no dither). 
If you turn this off, you should set the --energy-floor option, e.g. to 1.0 or 0.1 (float, default = 1) + :param energy_floor: Floor on energy (absolute, not relative) in MFCC computation. Only makes a difference if --use-energy=true; only necessary if --dither=0.0. Suggested values: 0.1 or 1.0 (float, default = 0) + :param frame_length: Frame length in milliseconds (float, default = 25) + :param frame_shift: Frame shift in milliseconds (float, default = 10) + :param high_freq: High cutoff frequency for mel bins (if <= 0, offset from Nyquist) (float, default = 0) + :param low_freq: Low cutoff frequency for mel bins (float, default = 20) + :param num_ceps: Number of cepstra in MFCC computation (including C0) (int, default = 13) + :param num_mel_bins: Number of triangular mel-frequency bins (int, default = 23) + :param preemphasis_coefficient: Coefficient for use in signal preemphasis (float, default = 0.97) + :param raw_energy: If true, compute energy before preemphasis and windowing (bool, default = true) + :param remove_dc_offset: Subtract mean from waveform on each frame (bool, default = true) + :param round_to_power_of_two: If true, round window size to power of two by zero-padding input to FFT. (bool, default = true) + :param sample_frequency: Waveform data sample frequency (must match the waveform file, if specified there) (float, default = 16000) + :param snip_edges: If true, end effects will be handled by outputting only frames that completely fit in the file, and the number of frames depends on the frame-length. If false, the number of frames depends only on the frame-shift, and we reflect the data at the ends. 
(bool, default = true) + :param use_energy: Use energy (not C0) in MFCC computation (bool, default = true) + :param window_type: Type of window ("hamming"|"hanning"|"povey"|"rectangular"|"sine"|"blackmann") (string, default = "povey") + :param dtype: Type of array (np.float32|np.float64) (dtype or string, default=np.float32) + :return: Mel-frequency cespstral coefficients. + """ + feat, log_energy = compute_fbank_feats( + waveform=waveform, + blackman_coeff=blackman_coeff, + dither=dither, + energy_floor=energy_floor, + frame_length=frame_length, + frame_shift=frame_shift, + high_freq=high_freq, + low_freq=low_freq, + num_mel_bins=num_mel_bins, + preemphasis_coefficient=preemphasis_coefficient, + raw_energy=raw_energy, + remove_dc_offset=remove_dc_offset, + round_to_power_of_two=round_to_power_of_two, + sample_frequency=sample_frequency, + snip_edges=snip_edges, + use_energy=use_energy, + use_log_fbank=True, + use_power=True, + window_type=window_type, + dtype=dtype + ) + feat = dct(feat, type=2, axis=1, norm='ortho')[:, :num_ceps] + lifter_coeffs = compute_lifter_coeffs(cepstral_lifter, num_ceps).astype(dtype) + feat = feat * lifter_coeffs + if use_energy: + feat[:, 0] = log_energy + return feat + +# ---------- compute-mfcc-feats ---------- + + +# ---------- apply-cmvn-sliding ---------- + +def apply_cmvn_sliding(feat, center=False, window=600, min_window=100, norm_vars=False): + """ Apply sliding-window cepstral mean (and optionally variance) normalization + + :param feat: Cepstrum. + :param center: If true, use a window centered on the current frame (to the extent possible, modulo end effects). If false, window is to the left. (bool, default = false) + :param window: Window in frames for running average CMN computation (int, default = 600) + :param min_window: Minimum CMN window used at start of decoding (adds latency only at start). 
Only applicable if center == false, ignored if center==true (int, default = 100)
+    :param norm_vars: If true, normalize variance to one. (bool, default = false)
+    :return: Normalized cepstrum.
+    """
+    # double-precision
+    feat = apply_cmvn_sliding_internal(
+        feat=feat.astype(np.float64),
+        center=center,
+        window=window,
+        min_window=min_window,
+        norm_vars=norm_vars
+    ).astype(feat.dtype)
+    return feat
+
+# ---------- apply-cmvn-sliding ---------- diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/ivector.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/ivector.py new file mode 100644 index 000000000..5577be1f0 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/kaldifeat/ivector.py @@ -0,0 +1,43 @@ +import numpy as np
+
+from .feature import sliding_window
+
+
+# ---------- compute-vad ----------
+
+def compute_vad(log_energy, energy_mean_scale=0.5, energy_threshold=0.5, frames_context=0, proportion_threshold=0.6):
+    """ Apply voice activity detection
+
+    :param log_energy: Log mel energy.
+    :param energy_mean_scale: If this is set to s, to get the actual threshold we let m be the mean log-energy of the file, and use s*m + vad-energy-threshold (float, default = 0.5)
+    :param energy_threshold: Constant term in energy threshold for VAD (also see energy_mean_scale) (float, default = 0.5)
+    :param frames_context: Number of frames of context on each side of central frame, in window for which energy is monitored (int, default = 0)
+    :param proportion_threshold: Parameter controlling the proportion of frames within the window that need to have more energy than the threshold (float, default = 0.6)
+    :return: A vector of boolean that are True if we judge the frame voiced and False otherwise. 
+ """ + assert len(log_energy.shape) == 1 + assert energy_mean_scale >= 0 + assert frames_context >= 0 + assert 0 < proportion_threshold < 1 + dtype = log_energy.dtype + energy_threshold += energy_mean_scale * log_energy.mean() + if frames_context > 0: + num_frames = len(log_energy) + window_size = frames_context * 2 + 1 + log_energy_pad = np.concatenate([ + np.zeros(frames_context, dtype=dtype), + log_energy, + np.zeros(frames_context, dtype=dtype) + ]) + log_energy_window = sliding_window(log_energy_pad, window_size, 1) + num_count = np.count_nonzero(log_energy_window > energy_threshold, axis=1) + den_count = np.ones(num_frames, dtype=dtype) * window_size + max_den_count = np.arange(frames_context + 1, min(window_size, num_frames) + 1, dtype=dtype) + den_count[:-(frames_context + 2):-1] = max_den_count + den_count[:frames_context + 1] = np.min([den_count[:frames_context + 1], max_den_count], axis=0) + vad = num_count / den_count >= proportion_threshold + else: + vad = log_energy > energy_threshold + return vad + +# ---------- compute-vad ---------- diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/rapid_paraformer.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/rapid_paraformer.py new file mode 100644 index 000000000..262c2581c --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/rapid_paraformer.py @@ -0,0 +1,139 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +import traceback +from pathlib import Path +from typing import List, Union, Tuple + +import librosa +import numpy as np + +from .utils import (CharTokenizer, Hypothesis, ONNXRuntimeError, + OrtInferSession, TokenIDConverter, WavFrontend, get_logger, + read_yaml) + +logging = get_logger() + + +class RapidParaformer(): + def __init__(self, config_path: Union[str, Path]) -> None: + if not Path(config_path).exists(): + raise FileNotFoundError(f'{config_path} does not exist.') + + config = read_yaml(config_path) + + self.converter = 
TokenIDConverter(**config['TokenIDConverter']) + self.tokenizer = CharTokenizer(**config['CharTokenizer']) + self.frontend = WavFrontend( + cmvn_file=config['WavFrontend']['cmvn_file'], + **config['WavFrontend']['frontend_conf'] + ) + self.ort_infer = OrtInferSession(config['Model']) + self.batch_size = config['Model']['batch_size'] + + def __call__(self, wav_content: Union[str, np.ndarray, List[str]]) -> List: + waveform_list = self.load_data(wav_content) + waveform_nums = len(waveform_list) + + asr_res = [] + for beg_idx in range(0, waveform_nums, self.batch_size): + end_idx = min(waveform_nums, beg_idx + self.batch_size) + + feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx]) + + try: + am_scores, valid_token_lens = self.infer(feats, feats_len) + except ONNXRuntimeError: + logging.error(traceback.format_exc()) + preds = [] + else: + preds = self.decode(am_scores, valid_token_lens) + + asr_res.extend(preds) + return asr_res + + def load_data(self, + wav_content: Union[str, np.ndarray, List[str]]) -> List: + def load_wav(path: str) -> np.ndarray: + waveform, _ = librosa.load(path) + return waveform[None, ...] 
+ + if isinstance(wav_content, np.ndarray): + return [wav_content] + + if isinstance(wav_content, str): + return [load_wav(wav_content)] + + if isinstance(wav_content, list): + return [load_wav(path) for path in wav_content] + + raise TypeError( + f'The type of {wav_content} is not in [str, np.ndarray, list]') + + def extract_feat(self, + waveform_list: List[np.ndarray] + ) -> Tuple[np.ndarray, np.ndarray]: + feats, feats_len = [], [] + for waveform in waveform_list: + speech, _ = self.frontend.fbank(waveform) + feat, feat_len = self.frontend.lfr_cmvn(speech) + feats.append(feat) + feats_len.append(feat_len) + + feats = self.pad_feats(feats, np.max(feats_len)) + feats_len = np.array(feats_len).astype(np.int32) + return feats, feats_len + + @staticmethod + def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray: + def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray: + pad_width = ((0, max_feat_len - cur_len), (0, 0)) + return np.pad(feat, pad_width, 'constant', constant_values=0) + + feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats] + feats = np.array(feat_res).astype(np.float32) + return feats + + def infer(self, feats: np.ndarray, + feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]: + am_scores, token_nums = self.ort_infer([feats, feats_len]) + return am_scores, token_nums + + def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]: + return [self.decode_one(am_score, token_num) + for am_score, token_num in zip(am_scores, token_nums)] + + def decode_one(self, + am_score: np.ndarray, + valid_token_num: int) -> List[str]: + yseq = am_score.argmax(axis=-1) + score = am_score.max(axis=-1) + score = np.sum(score, axis=-1) + + # pad with mask tokens to ensure compatibility with sos/eos tokens + # asr_model.sos:1 asr_model.eos:2 + yseq = np.array([1] + yseq.tolist() + [2]) + hyp = Hypothesis(yseq=yseq, score=score) + + # remove sos/eos and get results + last_pos = -1 + token_int = hyp.yseq[1:last_pos].tolist() + + # 
remove blank symbol id, which is assumed to be 0 + token_int = list(filter(lambda x: x not in (0, 2), token_int)) + + # Change integer-ids to tokens + token = self.converter.ids2tokens(token_int) + text = self.tokenizer.tokens2text(token) + return text[:valid_token_num-1] + + +if __name__ == '__main__': + project_dir = Path(__file__).resolve().parent.parent + cfg_path = project_dir / 'resources' / 'config.yaml' + paraformer = RapidParaformer(cfg_path) + + wav_file = '0478_00017.wav' + for i in range(1000): + result = paraformer(wav_file) + print(result) diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils.py new file mode 100644 index 000000000..adab79084 --- /dev/null +++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils.py @@ -0,0 +1,371 @@ +# -*- encoding: utf-8 -*- +# @Author: SWHL +# @Contact: liekkaskono@163.com +import functools +import logging +import pickle +from pathlib import Path +from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union + +import numpy as np +import yaml +from onnxruntime import (GraphOptimizationLevel, InferenceSession, + SessionOptions, get_available_providers, get_device) +from typeguard import check_argument_types + +from .kaldifeat import compute_fbank_feats + +root_dir = Path(__file__).resolve().parent + +logger_initialized = {} + + +class TokenIDConverter(): + def __init__(self, token_path: Union[Path, str], + unk_symbol: str = "