# Copyright 2019 Hitachi, Ltd. (author: Yusuke Fujita)
# Licensed under the MIT license.
#
# This module is for computing audio features
#
# Reconstructed from git patch b7b65c844d6d ("update ola", 2023-03-13), which
# adds this file as funasr/models/frontend/eend_ola_feature.py.

import numpy as np


def transform(Y, dtype=np.float32, sr=8000, n_mels=23):
    """Convert an STFT matrix into mean-normalized log-mel features.

    Args:
        Y: 2-D array whose second axis holds the frequency bins; the FFT
            size is recovered as ``2 * (Y.shape[1] - 1)``. Complex input is
            accepted because only ``np.abs(Y)`` is used.
            (Presumably the output of ``stft`` in this module, i.e.
            frames x bins -- confirm against the caller.)
        dtype: dtype of the returned array.
        sr: sampling rate in Hz handed to the mel filterbank.
        n_mels: number of mel bands.

    Returns:
        Array of ``dtype`` with ``n_mels`` columns: the log10 mel power
        spectrum with the per-column mean (taken over axis 0) removed.
    """
    # Imported here so the numpy-only helpers in this module do not need
    # librosa to be installed.
    import librosa

    magnitude = np.abs(Y)
    n_fft = 2 * (magnitude.shape[1] - 1)
    # Keyword arguments are required by librosa >= 0.10 (positional use
    # raises TypeError there) and are accepted by older releases too.
    mel_basis = librosa.filters.mel(sr=sr, n_fft=n_fft, n_mels=n_mels)
    mel_power = np.dot(magnitude ** 2, mel_basis.T)
    # Floor at 1e-10 so silent bins do not produce -inf.
    log_mel = np.log10(np.maximum(mel_power, 1e-10))
    return (log_mel - np.mean(log_mel, axis=0)).astype(dtype)


def subsample(Y, T, subsampling=1):
    """Keep every ``subsampling``-th entry of ``Y`` and ``T`` along axis 0.

    Args:
        Y: sliceable sequence (presumably the feature matrix).
        T: sliceable sequence (presumably the matching labels).
        subsampling: step of the slice; 1 keeps everything.

    Returns:
        Tuple ``(Y[::subsampling], T[::subsampling])``.
    """
    return Y[::subsampling], T[::subsampling]


def splice(Y, context_size=0):
    """Stack each row of ``Y`` with its neighbouring rows.

    ``Y`` is zero-padded with ``context_size`` rows at both ends, and output
    row ``i`` is the concatenation of padded rows ``i .. i + 2*context_size``.

    Args:
        Y: 2-D ndarray.
        context_size: number of rows of context taken on each side.

    Returns:
        Read-only array of shape
        ``(Y.shape[0], Y.shape[1] * (2 * context_size + 1))``. It is a
        strided view, so output rows share memory with each other; copy it
        before modifying.
    """
    padded = np.ascontiguousarray(
        np.pad(Y, [(context_size, context_size), (0, 0)], 'constant')
    )
    n_rows, n_cols = Y.shape
    itemsize = padded.itemsize
    # Advancing one output row moves one padded row forward, while each
    # output row reads (2 * context_size + 1) * n_cols consecutive elements,
    # i.e. it runs on into the following padded rows.
    return np.lib.stride_tricks.as_strided(
        padded,
        shape=(n_rows, n_cols * (2 * context_size + 1)),
        strides=(itemsize * n_cols, itemsize),
        writeable=False,
    )


def stft(data, frame_size=1024, frame_shift=256):
    """Compute the STFT of ``data`` with one frame per row.

    Args:
        data: 1-D signal passed to ``librosa.stft``.
        frame_size: analysis window length in samples.
        frame_shift: hop length in samples.

    Returns:
        The transposed ``librosa.stft`` output (frames along axis 0). The FFT
        size is ``frame_size`` rounded up to the next power of two. When
        ``len(data)`` is a multiple of ``frame_shift`` the final frame is
        dropped.
    """
    # Imported here so the numpy-only helpers in this module do not need
    # librosa to be installed.
    import librosa

    fft_size = 1 << (frame_size - 1).bit_length()
    frames = librosa.stft(
        data,
        n_fft=fft_size,
        win_length=frame_size,
        hop_length=frame_shift,
    ).T
    if len(data) % frame_shift == 0:
        # NOTE(review): presumably trims the extra frame produced by
        # librosa's default centred framing -- confirm.
        return frames[:-1]
    return frames