diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
deleted file mode 100644
index 475047903..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# -*- encoding: utf-8 -*-
-from .paraformer_bin import Paraformer
-from .vad_bin import Fsmn_vad
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
deleted file mode 100644
index cbdb8d9e6..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
+++ /dev/null
@@ -1,187 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import os.path
-from pathlib import Path
-from typing import List, Union, Tuple
-
-import copy
-import librosa
-import numpy as np
-
-from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
-                          OrtInferSession, TokenIDConverter, get_logger,
-                          read_yaml)
-from .utils.postprocess_utils import sentence_postprocess
-from .utils.frontend import WavFrontend
-from .utils.timestamp_utils import time_stamp_lfr6_onnx
-
-logging = get_logger()
-
-
-class Paraformer():
-    def __init__(self, model_dir: Union[str, Path] = None,
-                 batch_size: int = 1,
-                 device_id: Union[str, int] = "-1",
-                 plot_timestamp_to: str = "",
-                 pred_bias: int = 1,
-                 quantize: bool = False,
-                 intra_op_num_threads: int = 4,
-                 ):
-
-        if not Path(model_dir).exists():
-            raise FileNotFoundError(f'{model_dir} does not exist.')
-
-        model_file = os.path.join(model_dir, 'model.onnx')
-        if quantize:
-            model_file = os.path.join(model_dir, 'model_quant.onnx')
-        config_file = os.path.join(model_dir, 'config.yaml')
-        cmvn_file = os.path.join(model_dir, 'am.mvn')
-        config = read_yaml(config_file)
-
-        self.converter = TokenIDConverter(config['token_list'])
-        self.tokenizer = CharTokenizer()
-        self.frontend = WavFrontend(
-            cmvn_file=cmvn_file,
-            **config['frontend_conf']
-        )
-        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
-        self.batch_size = batch_size
-        self.plot_timestamp_to = plot_timestamp_to
-        self.pred_bias = pred_bias
-
-    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
-        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
-        waveform_nums = len(waveform_list)
-        asr_res = []
-        for beg_idx in range(0, waveform_nums, self.batch_size):
-
-            end_idx = min(waveform_nums, beg_idx + self.batch_size)
-            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
-            try:
-                outputs = self.infer(feats, feats_len)
-                am_scores, valid_token_lens = outputs[0], outputs[1]
-                if len(outputs) == 4:
-                    # for BiCifParaformer Inference
-                    us_alphas, us_peaks = outputs[2], outputs[3]
-                else:
-                    us_alphas, us_peaks = None, None
-            except ONNXRuntimeError:
-                #logging.warning(traceback.format_exc())
-                logging.warning("input wav is silence or noise")
-                preds = ['']
-            else:
-                preds = self.decode(am_scores, valid_token_lens)
-                if us_peaks is None:
-                    for pred in preds:
-                        pred = sentence_postprocess(pred)
-                        asr_res.append({'preds': pred})
-                else:
-                    for pred, us_peaks_ in zip(preds, us_peaks):
-                        raw_tokens = pred
-                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
-                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
-                        # logging.warning(timestamp)
-                        if len(self.plot_timestamp_to):
-                            self.plot_wave_timestamp(waveform_list[0],
-                                                     timestamp, self.plot_timestamp_to)
-                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
-        return asr_res
-
-    def plot_wave_timestamp(self, wav, text_timestamp, dest):
-        # TODO: Plot the wav and timestamp results with matplotlib
-        import matplotlib
-        matplotlib.use('Agg')
-        matplotlib.rc("font", family='Alibaba PuHuiTi')  # set it to a font that your system supports
-        import matplotlib.pyplot as plt
-        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
-        ax2 = ax1.twinx()
-        ax2.set_ylim([0, 2.0])
-        # plot waveform
-        ax1.set_ylim([-0.3, 0.3])
-        time = np.arange(wav.shape[0]) / 16000
-        ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
-        # plot lines and text
-        for (char, start, end) in text_timestamp:
-            ax1.vlines(start, -0.3, 0.3, ls='--')
-            ax1.vlines(end, -0.3, 0.3, ls='--')
-            x_adj = 0.045 if char != '<sil>' else 0.12
-            ax1.text((start + end) * 0.5 - x_adj, 0, char)
-        # plt.legend()
-        plotname = "{}/timestamp.png".format(dest)
-        plt.savefig(plotname, bbox_inches='tight')
-
-    def load_data(self,
-                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
-        def load_wav(path: str) -> np.ndarray:
-            waveform, _ = librosa.load(path, sr=fs)
-            return waveform
-
-        if isinstance(wav_content, np.ndarray):
-            return [wav_content]
-
-        if isinstance(wav_content, str):
-            return [load_wav(wav_content)]
-
-        if isinstance(wav_content, list):
-            return [load_wav(path) for path in wav_content]
-
-        raise TypeError(
-            f'The type of {wav_content} is not in [str, np.ndarray, list]')
-
-    def extract_feat(self,
-                     waveform_list: List[np.ndarray]
-                     ) -> Tuple[np.ndarray, np.ndarray]:
-        feats, feats_len = [], []
-        for waveform in waveform_list:
-            speech, _ = self.frontend.fbank(waveform)
-            feat, feat_len = self.frontend.lfr_cmvn(speech)
-            feats.append(feat)
-            feats_len.append(feat_len)
-
-        feats = self.pad_feats(feats, np.max(feats_len))
-        feats_len = np.array(feats_len).astype(np.int32)
-        return feats, feats_len
-
-    @staticmethod
-    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
-        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
-            pad_width = ((0, max_feat_len - cur_len), (0, 0))
-            return np.pad(feat, pad_width, 'constant', constant_values=0)
-
-        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
-        feats = np.array(feat_res).astype(np.float32)
-        return feats
-
-    def infer(self, feats: np.ndarray,
-              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        outputs = self.ort_infer([feats, feats_len])
-        return outputs
-
-    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
-        return [self.decode_one(am_score, token_num)
-                for am_score, token_num in zip(am_scores, token_nums)]
-
-    def decode_one(self,
-                   am_score: np.ndarray,
-                   valid_token_num: int) -> List[str]:
-        yseq = am_score.argmax(axis=-1)
-        score = am_score.max(axis=-1)
-        score = np.sum(score, axis=-1)
-
-        # pad with mask tokens to ensure compatibility with sos/eos tokens
-        # asr_model.sos:1 asr_model.eos:2
-        yseq = np.array([1] + yseq.tolist() + [2])
-        hyp = Hypothesis(yseq=yseq, score=score)
-
-        # remove sos/eos and get results
-        last_pos = -1
-        token_int = hyp.yseq[1:last_pos].tolist()
-
-        # remove blank symbol id, which is assumed to be 0
-        token_int = list(filter(lambda x: x not in (0, 2), token_int))
-
-        # Change integer-ids to tokens
-        token = self.converter.ids2tokens(token_int)
-        token = token[:valid_token_num-self.pred_bias]
-        # texts = sentence_postprocess(token)
-        return token
-
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py
deleted file mode 100644
index e69de29bb..000000000
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
deleted file mode 100644
index 8eed22fa4..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
+++ /dev/null
@@ -1,607 +0,0 @@
-from enum import Enum
-from typing import List, Tuple, Dict, Any
-
-import math
-import numpy as np
-
-class VadStateMachine(Enum):
-    kVadInStateStartPointNotDetected = 1
-    kVadInStateInSpeechSegment = 2
-    kVadInStateEndPointDetected = 3
-
-
-class FrameState(Enum):
-    kFrameStateInvalid = -1
-    kFrameStateSpeech = 1
-    kFrameStateSil = 0
-
-
-# final voice/unvoice state per frame
-class AudioChangeState(Enum):
-    kChangeStateSpeech2Speech = 0
-    kChangeStateSpeech2Sil = 1
-    kChangeStateSil2Sil = 2
-    kChangeStateSil2Speech = 3
-    kChangeStateNoBegin = 4
-    kChangeStateInvalid = 5
-
-
-class VadDetectMode(Enum):
-    kVadSingleUtteranceDetectMode = 0
-    kVadMutipleUtteranceDetectMode = 1
-
-
-class VADXOptions:
-    def __init__(
-            self,
-            sample_rate: int = 16000,
-            detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
-            snr_mode: int = 0,
-            max_end_silence_time: int = 800,
-            max_start_silence_time: int = 3000,
-            do_start_point_detection: bool = True,
-            do_end_point_detection: bool = True,
-            window_size_ms: int = 200,
-            sil_to_speech_time_thres: int = 150,
-            speech_to_sil_time_thres: int = 150,
-            speech_2_noise_ratio: float = 1.0,
-            do_extend: int = 1,
-            lookback_time_start_point: int = 200,
-            lookahead_time_end_point: int = 100,
-            max_single_segment_time: int = 60000,
-            nn_eval_block_size: int = 8,
-            dcd_block_size: int = 4,
-            snr_thres: int = -100.0,
-            noise_frame_num_used_for_snr: int = 100,
-            decibel_thres: int = -100.0,
-            speech_noise_thres: float = 0.6,
-            fe_prior_thres: float = 1e-4,
-            silence_pdf_num: int = 1,
-            sil_pdf_ids: List[int] = [0],
-            speech_noise_thresh_low: float = -0.1,
-            speech_noise_thresh_high: float = 0.3,
-            output_frame_probs: bool = False,
-            frame_in_ms: int = 10,
-            frame_length_ms: int = 25,
-    ):
-        self.sample_rate = sample_rate
-        self.detect_mode = detect_mode
-        self.snr_mode = snr_mode
-        self.max_end_silence_time = max_end_silence_time
-        self.max_start_silence_time = max_start_silence_time
-        self.do_start_point_detection = do_start_point_detection
-        self.do_end_point_detection = do_end_point_detection
-        self.window_size_ms = window_size_ms
-        self.sil_to_speech_time_thres = sil_to_speech_time_thres
-        self.speech_to_sil_time_thres = speech_to_sil_time_thres
-        self.speech_2_noise_ratio = speech_2_noise_ratio
-        self.do_extend = do_extend
-        self.lookback_time_start_point = lookback_time_start_point
-        self.lookahead_time_end_point = lookahead_time_end_point
-        self.max_single_segment_time = max_single_segment_time
-        self.nn_eval_block_size = nn_eval_block_size
-        self.dcd_block_size = dcd_block_size
-        self.snr_thres = snr_thres
-        self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
-        self.decibel_thres = decibel_thres
-        self.speech_noise_thres = speech_noise_thres
-        self.fe_prior_thres = fe_prior_thres
-        self.silence_pdf_num = silence_pdf_num
-        self.sil_pdf_ids = sil_pdf_ids
-        self.speech_noise_thresh_low = speech_noise_thresh_low
-        self.speech_noise_thresh_high = speech_noise_thresh_high
-        self.output_frame_probs = output_frame_probs
-        self.frame_in_ms = frame_in_ms
-        self.frame_length_ms = frame_length_ms
-
-
-class E2EVadSpeechBufWithDoa(object):
-    def __init__(self):
-        self.start_ms = 0
-        self.end_ms = 0
-        self.buffer = []
-        self.contain_seg_start_point = False
-        self.contain_seg_end_point = False
-        self.doa = 0
-
-    def Reset(self):
-        self.start_ms = 0
-        self.end_ms = 0
-        self.buffer = []
-        self.contain_seg_start_point = False
-        self.contain_seg_end_point = False
-        self.doa = 0
-
-
-class E2EVadFrameProb(object):
-    def __init__(self):
-        self.noise_prob = 0.0
-        self.speech_prob = 0.0
-        self.score = 0.0
-        self.frame_id = 0
-        self.frm_state = 0
-
-
-class WindowDetector(object):
-    def __init__(self, window_size_ms: int, sil_to_speech_time: int,
-                 speech_to_sil_time: int, frame_size_ms: int):
-        self.window_size_ms = window_size_ms
-        self.sil_to_speech_time = sil_to_speech_time
-        self.speech_to_sil_time = speech_to_sil_time
-        self.frame_size_ms = frame_size_ms
-
-        self.win_size_frame = int(window_size_ms / frame_size_ms)
-        self.win_sum = 0
-        self.win_state = [0] * self.win_size_frame  # initialize the window
-
-        self.cur_win_pos = 0
-        self.pre_frame_state = FrameState.kFrameStateSil
-        self.cur_frame_state = FrameState.kFrameStateSil
-        self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
-        self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
-
-        self.voice_last_frame_count = 0
-        self.noise_last_frame_count = 0
-        self.hydre_frame_count = 0
-
-    def Reset(self) -> None:
-        self.cur_win_pos = 0
-        self.win_sum = 0
-        self.win_state = [0] * self.win_size_frame
-        self.pre_frame_state = FrameState.kFrameStateSil
-        self.cur_frame_state = FrameState.kFrameStateSil
-        self.voice_last_frame_count = 0
-        self.noise_last_frame_count = 0
-        self.hydre_frame_count = 0
-
-    def GetWinSize(self) -> int:
-        return int(self.win_size_frame)
-
-    def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
-        cur_frame_state = FrameState.kFrameStateSil
-        if frameState == FrameState.kFrameStateSpeech:
-            cur_frame_state = 1
-        elif frameState == FrameState.kFrameStateSil:
-            cur_frame_state = 0
-        else:
-            return AudioChangeState.kChangeStateInvalid
-        self.win_sum -= self.win_state[self.cur_win_pos]
-        self.win_sum += cur_frame_state
-        self.win_state[self.cur_win_pos] = cur_frame_state
-        self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
-
-        if self.pre_frame_state == FrameState.kFrameStateSil and self.win_sum >= self.sil_to_speech_frmcnt_thres:
-            self.pre_frame_state = FrameState.kFrameStateSpeech
-            return AudioChangeState.kChangeStateSil2Speech
-
-        if self.pre_frame_state == FrameState.kFrameStateSpeech and self.win_sum <= self.speech_to_sil_frmcnt_thres:
-            self.pre_frame_state = FrameState.kFrameStateSil
-            return AudioChangeState.kChangeStateSpeech2Sil
-
-        if self.pre_frame_state == FrameState.kFrameStateSil:
-            return AudioChangeState.kChangeStateSil2Sil
-        if self.pre_frame_state == FrameState.kFrameStateSpeech:
-            return AudioChangeState.kChangeStateSpeech2Speech
-        return AudioChangeState.kChangeStateInvalid
-
-    def FrameSizeMs(self) -> int:
-        return int(self.frame_size_ms)
-
-
-class E2EVadModel():
-    def __init__(self, vad_post_args: Dict[str, Any]):
-        super(E2EVadModel, self).__init__()
-        self.vad_opts = VADXOptions(**vad_post_args)
-        self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
-                                               self.vad_opts.sil_to_speech_time_thres,
-                                               self.vad_opts.speech_to_sil_time_thres,
-                                               self.vad_opts.frame_in_ms)
-        # self.encoder = encoder
-        # init variables
-        self.is_final = False
-        self.data_buf_start_frame = 0
-        self.frm_cnt = 0
-        self.latest_confirmed_speech_frame = 0
-        self.lastest_confirmed_silence_frame = -1
-        self.continous_silence_frame_count = 0
-        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
-        self.confirmed_start_frame = -1
-        self.confirmed_end_frame = -1
-        self.number_end_time_detected = 0
-        self.sil_frame = 0
-        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
-        self.noise_average_decibel = -100.0
-        self.pre_end_silence_detected = False
-        self.next_seg = True
-
-        self.output_data_buf = []
-        self.output_data_buf_offset = 0
-        self.frame_probs = []
-        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
-        self.speech_noise_thres = self.vad_opts.speech_noise_thres
-        self.scores = None
-        self.max_time_out = False
-        self.decibel = []
-        self.data_buf = None
-        self.data_buf_all = None
-        self.waveform = None
-        self.ResetDetection()
-
-    def AllResetDetection(self):
-        self.is_final = False
-        self.data_buf_start_frame = 0
-        self.frm_cnt = 0
-        self.latest_confirmed_speech_frame = 0
-        self.lastest_confirmed_silence_frame = -1
-        self.continous_silence_frame_count = 0
-        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
-        self.confirmed_start_frame = -1
-        self.confirmed_end_frame = -1
-        self.number_end_time_detected = 0
-        self.sil_frame = 0
-        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
-        self.noise_average_decibel = -100.0
-        self.pre_end_silence_detected = False
-        self.next_seg = True
-
-        self.output_data_buf = []
-        self.output_data_buf_offset = 0
-        self.frame_probs = []
-        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
-        self.speech_noise_thres = self.vad_opts.speech_noise_thres
-        self.scores = None
-        self.max_time_out = False
-        self.decibel = []
-        self.data_buf = None
-        self.data_buf_all = None
-        self.waveform = None
-        self.ResetDetection()
-
-    def ResetDetection(self):
-        self.continous_silence_frame_count = 0
-        self.latest_confirmed_speech_frame = 0
-        self.lastest_confirmed_silence_frame = -1
-        self.confirmed_start_frame = -1
-        self.confirmed_end_frame = -1
-        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
-        self.windows_detector.Reset()
-        self.sil_frame = 0
-        self.frame_probs = []
-
-    def ComputeDecibel(self) -> None:
-        frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
-        frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        if self.data_buf_all is None:
-            self.data_buf_all = self.waveform[0]  # self.data_buf is pointed to self.waveform[0]
-            self.data_buf = self.data_buf_all
-        else:
-            self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
-        for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
-            self.decibel.append(
-                10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
-                                0.000001))
-
-    def ComputeScores(self, scores: np.ndarray) -> None:
-        # scores = self.encoder(feats, in_cache)  # return B * T * D
-        self.vad_opts.nn_eval_block_size = scores.shape[1]
-        self.frm_cnt += scores.shape[1]  # count total frames
-        if self.scores is None:
-            self.scores = scores  # the first calculation
-        else:
-            self.scores = np.concatenate((self.scores, scores), axis=1)
-
-    def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
-        while self.data_buf_start_frame < frame_idx:
-            if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
-                self.data_buf_start_frame += 1
-                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
-                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
-
-    def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
-                           last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
-        self.PopDataBufTillFrame(start_frm)
-        expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
-        if last_frm_is_end_point:
-            extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
-                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
-            expected_sample_number += int(extra_sample)
-        if end_point_is_sent_end:
-            expected_sample_number = max(expected_sample_number, len(self.data_buf))
-        if len(self.data_buf) < expected_sample_number:
-            print('error in calling pop data_buf\n')
-
-        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
-            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
-            self.output_data_buf[-1].Reset()
-            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
-            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
-            self.output_data_buf[-1].doa = 0
-        cur_seg = self.output_data_buf[-1]
-        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
-            print('warning\n')
-        out_pos = len(cur_seg.buffer)  # no operation is performed on cur_seg.buffer at present
-        data_to_pop = 0
-        if end_point_is_sent_end:
-            data_to_pop = expected_sample_number
-        else:
-            data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        if data_to_pop > len(self.data_buf):
-            print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
-            data_to_pop = len(self.data_buf)
-            expected_sample_number = len(self.data_buf)
-
-        cur_seg.doa = 0
-        for sample_cpy_out in range(0, data_to_pop):
-            # cur_seg.buffer[out_pos ++] = data_buf_.back();
-            out_pos += 1
-        for sample_cpy_out in range(data_to_pop, expected_sample_number):
-            # cur_seg.buffer[out_pos++] = data_buf_.back()
-            out_pos += 1
-        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
-            print('Something wrong with the VAD algorithm\n')
-        self.data_buf_start_frame += frm_cnt
-        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
-        if first_frm_is_start_point:
-            cur_seg.contain_seg_start_point = True
-        if last_frm_is_end_point:
-            cur_seg.contain_seg_end_point = True
-
-    def OnSilenceDetected(self, valid_frame: int):
-        self.lastest_confirmed_silence_frame = valid_frame
-        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
-            self.PopDataBufTillFrame(valid_frame)
-        # silence_detected_callback_
-        # pass
-
-    def OnVoiceDetected(self, valid_frame: int) -> None:
-        self.latest_confirmed_speech_frame = valid_frame
-        self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
-
-    def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
-        if self.vad_opts.do_start_point_detection:
-            pass
-        if self.confirmed_start_frame != -1:
-            print('not reset vad properly\n')
-        else:
-            self.confirmed_start_frame = start_frame
-
-        if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
-            self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
-
-    def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
-        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
-            self.OnVoiceDetected(t)
-        if self.vad_opts.do_end_point_detection:
-            pass
-        if self.confirmed_end_frame != -1:
-            print('not reset vad properly\n')
-        else:
-            self.confirmed_end_frame = end_frame
-        if not fake_result:
-            self.sil_frame = 0
-            self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
-        self.number_end_time_detected += 1
-
-    def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
-        if is_final_frame:
-            self.OnVoiceEnd(cur_frm_idx, False, True)
-            self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-
-    def GetLatency(self) -> int:
-        return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
-
-    def LatencyFrmNumAtStartPoint(self) -> int:
-        vad_latency = self.windows_detector.GetWinSize()
-        if self.vad_opts.do_extend:
-            vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
-        return vad_latency
-
-    def GetFrameState(self, t: int) -> FrameState:
-        frame_state = FrameState.kFrameStateInvalid
-        cur_decibel = self.decibel[t]
-        cur_snr = cur_decibel - self.noise_average_decibel
-        # for each frame, calc log posterior probability of each state
-        if cur_decibel < self.vad_opts.decibel_thres:
-            frame_state = FrameState.kFrameStateSil
-            self.DetectOneFrame(frame_state, t, False)
-            return frame_state
-
-        sum_score = 0.0
-        noise_prob = 0.0
-        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
-        if len(self.sil_pdf_ids) > 0:
-            assert len(self.scores) == 1  # only batch_size = 1 is supported for testing
-            sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
-            sum_score = sum(sil_pdf_scores)
-            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
-            total_score = 1.0
-            sum_score = total_score - sum_score
-        speech_prob = math.log(sum_score)
-        if self.vad_opts.output_frame_probs:
-            frame_prob = E2EVadFrameProb()
-            frame_prob.noise_prob = noise_prob
-            frame_prob.speech_prob = speech_prob
-            frame_prob.score = sum_score
-            frame_prob.frame_id = t
-            self.frame_probs.append(frame_prob)
-        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
-            if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
-                frame_state = FrameState.kFrameStateSpeech
-            else:
-                frame_state = FrameState.kFrameStateSil
-        else:
-            frame_state = FrameState.kFrameStateSil
-            if self.noise_average_decibel < -99.9:
-                self.noise_average_decibel = cur_decibel
-            else:
-                self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
-                        self.vad_opts.noise_frame_num_used_for_snr
-                        - 1)) / self.vad_opts.noise_frame_num_used_for_snr
-
-        return frame_state
-
-
-    def __call__(self, score: np.ndarray, waveform: np.ndarray,
-                 is_final: bool = False, max_end_sil: int = 800
-                 ):
-        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
-        self.waveform = waveform  # compute decibel for each frame
-        self.ComputeDecibel()
-        self.ComputeScores(score)
-        if not is_final:
-            self.DetectCommonFrames()
-        else:
-            self.DetectLastFrames()
-        segments = []
-        for batch_num in range(0, score.shape[0]):  # only support batch_size = 1 now
-            segment_batch = []
-            if len(self.output_data_buf) > 0:
-                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
-                    if not self.output_data_buf[i].contain_seg_start_point:
-                        continue
-                    if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
-                        continue
-                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
-                    if self.output_data_buf[i].contain_seg_end_point:
-                        end_ms = self.output_data_buf[i].end_ms
-                        self.next_seg = True
-                        self.output_data_buf_offset += 1
-                    else:
-                        end_ms = -1
-                        self.next_seg = False
-                    segment = [start_ms, end_ms]
-                    segment_batch.append(segment)
-            if segment_batch:
-                segments.append(segment_batch)
-        if is_final:
-            # reset class variables and clear the dict for the next query
-            self.AllResetDetection()
-        return segments
-
-    def DetectCommonFrames(self) -> int:
-        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
-            return 0
-        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
-            frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
-            self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
-
-        return 0
-
-    def DetectLastFrames(self) -> int:
-        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
-            return 0
-        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
-            frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
-            if i != 0:
-                self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
-            else:
-                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
-
-        return 0
-
-    def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
-        tmp_cur_frm_state = FrameState.kFrameStateInvalid
-        if cur_frm_state == FrameState.kFrameStateSpeech:
-            if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
-                tmp_cur_frm_state = FrameState.kFrameStateSpeech
-            else:
-                tmp_cur_frm_state = FrameState.kFrameStateSil
-        elif cur_frm_state == FrameState.kFrameStateSil:
-            tmp_cur_frm_state = FrameState.kFrameStateSil
-        state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
-        frm_shift_in_ms = self.vad_opts.frame_in_ms
-        if AudioChangeState.kChangeStateSil2Speech == state_change:
-            silence_frame_count = self.continous_silence_frame_count
-            self.continous_silence_frame_count = 0
-            self.pre_end_silence_detected = False
-            start_frame = 0
-            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
-                start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
-                self.OnVoiceStart(start_frame)
-                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
-                for t in range(start_frame + 1, cur_frm_idx + 1):
-                    self.OnVoiceDetected(t)
-            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
-                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
-                    self.OnVoiceDetected(t)
-                if cur_frm_idx - self.confirmed_start_frame + 1 > \
-                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
-                    self.OnVoiceEnd(cur_frm_idx, False, False)
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                elif not is_final_frame:
-                    self.OnVoiceDetected(cur_frm_idx)
-                else:
-                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
-            else:
-                pass
-        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
-            self.continous_silence_frame_count = 0
-            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
-                pass
-            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
-                if cur_frm_idx - self.confirmed_start_frame + 1 > \
-                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
-                    self.OnVoiceEnd(cur_frm_idx, False, False)
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                elif not is_final_frame:
-                    self.OnVoiceDetected(cur_frm_idx)
-                else:
-                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
-            else:
-                pass
-        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
-            self.continous_silence_frame_count = 0
-            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
-                if cur_frm_idx - self.confirmed_start_frame + 1 > \
-                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
-                    self.max_time_out = True
-                    self.OnVoiceEnd(cur_frm_idx, False, False)
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                elif not is_final_frame:
-                    self.OnVoiceDetected(cur_frm_idx)
-                else:
-                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
-            else:
-                pass
-        elif AudioChangeState.kChangeStateSil2Sil == state_change:
-            self.continous_silence_frame_count += 1
-            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
-                # silence timeout, return zero length decision
-                if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
-                        self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
-                        or (is_final_frame and self.number_end_time_detected == 0):
-                    for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
-                        self.OnSilenceDetected(t)
-                    self.OnVoiceStart(0, True)
-                    self.OnVoiceEnd(0, True, False);
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                else:
-                    if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
-                        self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
-            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
-                if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
-                    lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
-                    if self.vad_opts.do_extend:
-                        lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
-                        lookback_frame -= 1
-                        lookback_frame = max(0, lookback_frame)
-                    self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                elif cur_frm_idx - self.confirmed_start_frame + 1 > \
-                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
-                    self.OnVoiceEnd(cur_frm_idx, False, False)
-                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
-                elif self.vad_opts.do_extend and not is_final_frame:
-                    if self.continous_silence_frame_count <= int(
-                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
-                        self.OnVoiceDetected(cur_frm_idx)
-                else:
-                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
-            else:
-                pass
-
-        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
-                self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
-            self.ResetDetection()
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
deleted file mode 100644
index 11a86445d..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# -*- encoding: utf-8 -*-
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
-
-import numpy as np
-from typeguard import check_argument_types
-import kaldi_native_fbank as knf
-
-root_dir = Path(__file__).resolve().parent
-
-logger_initialized = {}
-
-
-class WavFrontend():
-    """Conventional frontend structure for ASR.
-    """
-
-    def __init__(
-            self,
-            cmvn_file: str = None,
-            fs: int = 16000,
-            window: str = 'hamming',
-            n_mels: int = 80,
-            frame_length: int = 25,
-            frame_shift: int = 10,
-            lfr_m: int = 1,
-            lfr_n: int = 1,
-            dither: float = 1.0,
-            **kwargs,
-    ) -> None:
-        check_argument_types()
-
-        opts = knf.FbankOptions()
-        opts.frame_opts.samp_freq = fs
-        opts.frame_opts.dither = dither
-        opts.frame_opts.window_type = window
-        opts.frame_opts.frame_shift_ms = float(frame_shift)
-        opts.frame_opts.frame_length_ms = float(frame_length)
-        opts.mel_opts.num_bins = n_mels
-        opts.energy_floor = 0
-        opts.frame_opts.snip_edges = True
-        opts.mel_opts.debug_mel = False
-        self.opts = opts
-
-        self.lfr_m = lfr_m
-        self.lfr_n = lfr_n
-        self.cmvn_file = cmvn_file
-
-        if self.cmvn_file:
-            self.cmvn = self.load_cmvn()
-        self.fbank_fn = None
-        self.fbank_beg_idx = 0
-        self.reset_status()
-
-    def fbank(self,
-              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        waveform = waveform * (1 << 15)
-        self.fbank_fn = knf.OnlineFbank(self.opts)
-        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
-        frames = self.fbank_fn.num_frames_ready
-        mat = np.empty([frames, self.opts.mel_opts.num_bins])
-        for i in range(frames):
-            mat[i, :] = self.fbank_fn.get_frame(i)
-        feat = mat.astype(np.float32)
-        feat_len = np.array(mat.shape[0]).astype(np.int32)
-        return feat, feat_len
-
-    def fbank_online(self,
-                     waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        waveform = waveform * (1 << 15)
-        # self.fbank_fn = knf.OnlineFbank(self.opts)
-        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
-        frames = self.fbank_fn.num_frames_ready
-        mat = np.empty([frames, self.opts.mel_opts.num_bins])
-        for i in range(self.fbank_beg_idx, frames):
-            mat[i, :] = self.fbank_fn.get_frame(i)
-        # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
-        feat = mat.astype(np.float32)
-        feat_len = np.array(mat.shape[0]).astype(np.int32)
-        return feat, feat_len
-
-    def reset_status(self):
-        self.fbank_fn = knf.OnlineFbank(self.opts)
-        self.fbank_beg_idx = 0
-
-    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        if self.lfr_m != 1 or self.lfr_n != 1:
-            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
-
-        if self.cmvn_file:
-            feat = self.apply_cmvn(feat)
-
-        feat_len = np.array(feat.shape[0]).astype(np.int32)
-        return feat, feat_len
-
-    @staticmethod
-    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
-        LFR_inputs = []
-
-        T = inputs.shape[0]
-        T_lfr = int(np.ceil(T / lfr_n))
-        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
-        inputs = np.vstack((left_padding, inputs))
-        T = T + (lfr_m - 1) // 2
-        for i in range(T_lfr):
-            if lfr_m <= T - i * lfr_n:
-                LFR_inputs.append(
-                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
-            else:
-                # process last LFR frame
-                num_padding = lfr_m - (T - i * lfr_n)
-                frame = inputs[i * lfr_n:].reshape(-1)
-                for _ in range(num_padding):
-                    frame = np.hstack((frame, inputs[-1]))
-
-                LFR_inputs.append(frame)
-        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
-        return LFR_outputs
-
-    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
-        """
-        Apply CMVN with mvn data
-        """
-        frame, dim = inputs.shape
-        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
-        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
-        inputs = (inputs + means) * vars
-        return inputs
-
-    def load_cmvn(self,) -> np.ndarray:
-        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
-            lines = f.readlines()
-
-        means_list = []
-        vars_list = []
-        for i in range(len(lines)):
-            line_item = lines[i].split()
-            if line_item[0] == '<AddShift>':
-                line_item = lines[i + 1].split()
-                if line_item[0] == '<LearnRateCoef>':
-                    add_shift_line = line_item[3:(len(line_item) - 1)]
-                    means_list = list(add_shift_line)
-                    continue
-            elif line_item[0] == '<Rescale>':
-                line_item = lines[i + 1].split()
-                if line_item[0] == '<LearnRateCoef>':
-                    rescale_line = line_item[3:(len(line_item) - 1)]
-                    vars_list = list(rescale_line)
-                    continue
-
-        means = np.array(means_list).astype(np.float64)
-        vars = np.array(vars_list).astype(np.float64)
-        cmvn = np.array([means, vars])
-        return cmvn
-
-def load_bytes(input):
-    middle_data = np.frombuffer(input, dtype=np.int16)
-    middle_data = np.asarray(middle_data)
-    if middle_data.dtype.kind not in 'iu':
-        raise TypeError("'middle_data' must be an array of integers")
-    dtype = np.dtype('float32')
-    if dtype.kind != 'f':
-        raise TypeError("'dtype' must be a floating point type")
-
-    i = np.iinfo(middle_data.dtype)
-    abs_max = 2 ** (i.bits - 1)
-    offset = i.min + abs_max
-    array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
-    return array
-
-
-def test():
-    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
-    import librosa
-    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
-    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
-    from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
-    config = read_yaml(config_file)
-    waveform, _ = librosa.load(path, sr=None)
-    frontend = WavFrontend(
-        cmvn_file=cmvn_file,
-        **config['frontend_conf'],
-    )
-    speech, _ = frontend.fbank_online(waveform)  # 1d, (sample,), numpy
-    feat, feat_len = frontend.lfr_cmvn(speech)  # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)
-
-    frontend.reset_status()  # clear cache
-    return feat, feat_len
-
-if __name__ == '__main__':
-    test()
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
deleted file mode 100644
index 575fb90dd..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
+++ /dev/null
@@ -1,240 +0,0 @@
-# Copyright (c) Alibaba, Inc. and its affiliates.
-
-import string
-import logging
-from typing import Any, List, Union
-
-
-def isChinese(ch: str):
-    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
-        return True
-    return False
-
-
-def isAllChinese(word: Union[List[Any], str]):
-    word_lists = []
-    for i in word:
-        cur = i.replace(' ', '')
-        cur = cur.replace('</s>', '')
-        cur = cur.replace('<s>', '')
-        word_lists.append(cur)
-
-    if len(word_lists) == 0:
-        return False
-
-    for ch in word_lists:
-        if isChinese(ch) is False:
-            return False
-    return True
-
-
-def isAllAlpha(word: Union[List[Any], str]):
-    word_lists = []
-    for i in word:
-        cur = i.replace(' ', '')
-        cur = cur.replace('</s>', '')
-        cur = cur.replace('<s>', '')
-        word_lists.append(cur)
-
-    if len(word_lists) == 0:
-        return False
-
-    for ch in word_lists:
-        if ch.isalpha() is False and ch != "'":
-            return False
-        elif ch.isalpha() is True and isChinese(ch) is True:
-            return False
-
-    return True
-
-
-# def abbr_dispose(words: List[Any]) -> List[Any]:
-def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
-    words_size = len(words)
-    word_lists = []
-    abbr_begin = []
-    abbr_end = []
-    last_num = -1
-    ts_lists = []
-    ts_nums = []
-    ts_index = 0
-    for num in range(words_size):
-        if num <= last_num:
-            continue
-
-        if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
-            if num + 1 < words_size and words[
-                    num + 1] == ' ' and num + 2 < words_size and len(
-                        words[num +
-                              2]) == 1 and words[num +
-                                                 2].encode('utf-8').isalpha():
-                # found the begin of abbr
-                abbr_begin.append(num)
-                num += 2
-                abbr_end.append(num)
-                # to find the end of abbr
-                while True:
-                    num += 1
-                    if num < words_size and words[num] == ' ':
-                        num += 1
-                        if num < words_size and len(
-                                words[num]) == 1 and words[num].encode(
-                                    'utf-8').isalpha():
-                            abbr_end.pop()
-                            abbr_end.append(num)
-                            last_num = num
-                        else:
-                            break
-                    else:
-                        break
-
-    for num in range(words_size):
-        if words[num] == ' ':
-            ts_nums.append(ts_index)
-        else:
-            ts_nums.append(ts_index)
-            ts_index += 1
-    last_num = -1
-    for num in range(words_size):
-        if num <= last_num:
-            continue
-
-        if num in abbr_begin:
-            if time_stamp is not None:
-                begin = time_stamp[ts_nums[num]][0]
-            word_lists.append(words[num].upper())
-            num += 1
-            while num < words_size:
-                if num in abbr_end:
-                    word_lists.append(words[num].upper())
-                    last_num = num
-                    break
-                else:
-                    if words[num].encode('utf-8').isalpha():
-                        word_lists.append(words[num].upper())
-                num += 1
-            if time_stamp is not None:
-                end = time_stamp[ts_nums[num]][1]
-                ts_lists.append([begin, end])
-        else:
-            word_lists.append(words[num])
-            if time_stamp is not None and words[num] != ' ':
-                begin = time_stamp[ts_nums[num]][0]
-                end = time_stamp[ts_nums[num]][1]
-                ts_lists.append([begin, end])
-                begin = end
-
-    if time_stamp is not None:
-        return word_lists, ts_lists
-    else:
-        return word_lists
-
-
-def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
-    middle_lists = []
-    word_lists = []
-    word_item = ''
-    ts_lists = []
-
-    # wash words lists
-    for i in words:
-        word = ''
-        if isinstance(i, str):
-            word = i
-        else:
-            word = i.decode('utf-8')
-
-        if word in ['<s>', '</s>', '<unk>']:
-            continue
-        else:
-            middle_lists.append(word)
-
-    # all chinese characters
-    if isAllChinese(middle_lists):
-        for i, ch in enumerate(middle_lists):
-            word_lists.append(ch.replace(' ', ''))
-        if time_stamp is not None:
-            ts_lists = time_stamp
-
-    # all alpha characters
-    elif isAllAlpha(middle_lists):
-        ts_flag = True
-        for i, ch in enumerate(middle_lists):
-            if ts_flag and time_stamp is not None:
-                begin = time_stamp[i][0]
-                end = time_stamp[i][1]
-            word = ''
-            if '@@' in ch:
-                word = ch.replace('@@', '')
-                word_item += word
-                if time_stamp is not None:
-                    ts_flag = False
-                    end = time_stamp[i][1]
-            else:
-                word_item += ch
-                word_lists.append(word_item)
-                word_lists.append(' ')
-                word_item = ''
-                if time_stamp is not None:
-                    ts_flag = True
-                    end = time_stamp[i][1]
-                    ts_lists.append([begin, end])
-                    begin = end
-
-    # mix characters
-    else:
-        alpha_blank = False
-        ts_flag = True
-        begin = -1
-        end = -1
-        for i, ch in enumerate(middle_lists):
-            if ts_flag and time_stamp is not None:
-                begin = time_stamp[i][0]
-                end = time_stamp[i][1]
-            word = ''
-            if isAllChinese(ch):
-                if alpha_blank is True:
-                    word_lists.pop()
-                word_lists.append(ch)
-                alpha_blank = False
-                if time_stamp is not None:
-                    ts_flag = True
-                    ts_lists.append([begin, end])
-                    begin = end
-            elif '@@' in ch:
-                word = ch.replace('@@', '')
-                word_item += word
-                alpha_blank = False
-                if time_stamp is not None:
-                    ts_flag = False
-                    end = time_stamp[i][1]
-            elif isAllAlpha(ch):
-                word_item += ch
-                word_lists.append(word_item)
-                word_lists.append(' ')
-                word_item = ''
-                alpha_blank = True
-                if time_stamp is not None:
-                    ts_flag = True
-                    end = time_stamp[i][1]
-                    ts_lists.append([begin, end])
-                    begin = end
-            else:
-                raise ValueError('invalid character: {}'.format(ch))
-
-    if time_stamp is not None:
-        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
-        real_word_lists = []
-        for ch in word_lists:
-            if ch != ' ':
-                real_word_lists.append(ch)
-        sentence = ' '.join(real_word_lists).strip()
-        return sentence, ts_lists, real_word_lists
-    else:
-        word_lists = abbr_dispose(word_lists)
-        real_word_lists = []
-        for ch in word_lists:
-            if ch != ' ':
-                real_word_lists.append(ch)
-        sentence = ''.join(word_lists).strip()
-        return sentence, real_word_lists
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
deleted file mode 100644
index 3a01812e8..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
+++ /dev/null
@@ -1,59 +0,0 @@
-import numpy as np
-
-
-def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
-    if not len(char_list):
-        return []
-    START_END_THRESHOLD = 5
-    MAX_TOKEN_DURATION = 30
-    TIME_RATE = 10.0 * 6 / 1000 / 3  # 3 times upsampled
-    cif_peak = us_cif_peak.reshape(-1)
-    num_frames = cif_peak.shape[-1]
-    if char_list[-1] == '</s>':
-        char_list = char_list[:-1]
-    # char_list = [i for i in text]
-    timestamp_list = []
-    new_char_list = []
-    # for bicif model trained with large data, cif2 actually fires when a character starts
-    # so treat the frames between two peaks as the duration of the former token
-    fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset  # np format
-    num_peak = len(fire_place)
-    assert num_peak == len(char_list) + 1  # number of peaks is supposed to be number of tokens + 1
-    # begin silence
-    if fire_place[0] > START_END_THRESHOLD:
-        # char_list.insert(0, '<sil>')
-        timestamp_list.append([0.0, fire_place[0]*TIME_RATE])
-        new_char_list.append('<sil>')
-    # tokens timestamp
-    for i in range(len(fire_place)-1):
-        new_char_list.append(char_list[i])
-        if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
-            timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
-        else:
-            # cut the duration to token and sil of the 0-weight frames last long
-            _split = fire_place[i] + MAX_TOKEN_DURATION
-            timestamp_list.append([fire_place[i]*TIME_RATE, _split*TIME_RATE])
-            timestamp_list.append([_split*TIME_RATE, fire_place[i+1]*TIME_RATE])
-            new_char_list.append('<sil>')
-    # tail token and end silence
-    if num_frames - fire_place[-1] > START_END_THRESHOLD:
-        _end = (num_frames + fire_place[-1]) / 2
-        timestamp_list[-1][1] = _end*TIME_RATE
-        timestamp_list.append([_end*TIME_RATE, num_frames*TIME_RATE])
-        new_char_list.append("<sil>")
-    else:
-        timestamp_list[-1][1] = num_frames*TIME_RATE
-    if begin_time:  # add offset time in model with vad
-        for i in range(len(timestamp_list)):
-            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
-            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
-    assert len(new_char_list) == len(timestamp_list)
-    res_str = ""
-    for char, timestamp in zip(new_char_list, timestamp_list):
-        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
-    res = []
-    for char, timestamp in zip(new_char_list, timestamp_list):
-        if char != '<sil>':
-            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
-    return res_str, res
-
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
deleted file mode 100644
index 2edde112e..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
+++ /dev/null
@@ -1,257 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import functools
-import logging
-import pickle
-from pathlib import Path
-from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
-
-import numpy as np
-import yaml
-from onnxruntime import (GraphOptimizationLevel, InferenceSession,
-                         SessionOptions, get_available_providers, get_device)
-from typeguard import check_argument_types
-
-import warnings
-
-root_dir = Path(__file__).resolve().parent
-
-logger_initialized = {}
-
-
-class TokenIDConverter():
-    def __init__(self, token_list: Union[List, str],
-                 ):
-        check_argument_types()
-
-        # self.token_list = self.load_token(token_path)
-        self.token_list = token_list
-        self.unk_symbol = token_list[-1]
-
-    # @staticmethod
-    # def load_token(file_path: Union[Path, str]) -> List:
-    #     if not Path(file_path).exists():
-    #         raise TokenIDConverterError(f'The {file_path} does not exist.')
-    #
-    #     with open(str(file_path), 'rb') as f:
-    #         token_list = pickle.load(f)
-    #
-    #     if len(token_list) != len(set(token_list)):
-    #         raise TokenIDConverterError('The Token exists duplicated symbol.')
-    #     return token_list
-
-    def get_num_vocabulary_size(self) -> int:
-        return len(self.token_list)
-
-    def ids2tokens(self,
-                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
-        if isinstance(integers, np.ndarray) and integers.ndim != 1:
-            raise TokenIDConverterError(
-                f"Must be 1 dim ndarray, but got {integers.ndim}")
-        return [self.token_list[i] for i in integers]
-
-    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
-        token2id = {v: i for i, v in enumerate(self.token_list)}
-        if self.unk_symbol not in token2id:
-            raise TokenIDConverterError(
-                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
-            )
-        unk_id = token2id[self.unk_symbol]
-        return [token2id.get(i, unk_id) for i in tokens]
-
-
-class CharTokenizer():
-    def __init__(
-            self,
-            symbol_value: Union[Path, str, Iterable[str]] = None,
-            space_symbol: str = "<space>",
-            remove_non_linguistic_symbols: bool = False,
-    ):
-        check_argument_types()
-
-        self.space_symbol = space_symbol
-        self.non_linguistic_symbols = self.load_symbols(symbol_value)
-        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
-
-    @staticmethod
-    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
-        if value is None:
-            return set()
-
-        if isinstance(value, Iterable[str]):
-            return set(value)
-
-        file_path = Path(value)
-        if not file_path.exists():
-            logging.warning("%s doesn't exist.", file_path)
-            return set()
-
-        with file_path.open("r", encoding="utf-8") as f:
-            return set(line.rstrip() for line in f)
-
-    def text2tokens(self, line: Union[str, list]) -> List[str]:
-        tokens = []
-        while len(line) != 0:
-            for w in self.non_linguistic_symbols:
-                if line.startswith(w):
-                    if not self.remove_non_linguistic_symbols:
-                        tokens.append(line[: len(w)])
-                    line = line[len(w):]
-                    break
-            else:
-                t = line[0]
-                if t == " ":
-                    t = "<space>"
-                tokens.append(t)
-                line = line[1:]
-        return tokens
-
-    def tokens2text(self, tokens: Iterable[str]) -> str:
-        tokens = [t if t != self.space_symbol else " " for t in tokens]
-        return "".join(tokens)
-
-    def __repr__(self):
-        return (
-            f"{self.__class__.__name__}("
-            f'space_symbol="{self.space_symbol}"'
-            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
-            f")"
-        )
-
-
-
-class Hypothesis(NamedTuple):
-    """Hypothesis data type."""
-
-    yseq: np.ndarray
-    score: Union[float, np.ndarray] = 0
-    scores: Dict[str, Union[float, np.ndarray]] = dict()
-    states: Dict[str, Any] = dict()
-
-    def asdict(self) -> dict:
-        """Convert data to JSON-friendly dict."""
-        return self._replace(
-            yseq=self.yseq.tolist(),
-            score=float(self.score),
-            scores={k: float(v) for k, v in self.scores.items()},
-        )._asdict()
-
-
-class TokenIDConverterError(Exception):
-    pass
-
-
-class ONNXRuntimeError(Exception):
-    pass
-
-
-class OrtInferSession():
-    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
-        device_id = str(device_id)
-        sess_opt = SessionOptions()
-        sess_opt.intra_op_num_threads = intra_op_num_threads
-        sess_opt.log_severity_level = 4
-        sess_opt.enable_cpu_mem_arena = False
-        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
-
-        cuda_ep = 'CUDAExecutionProvider'
-        cuda_provider_options = {
-            "device_id": device_id,
-            "arena_extend_strategy": "kNextPowerOfTwo",
-            "cudnn_conv_algo_search": "EXHAUSTIVE",
-            "do_copy_in_default_stream": "true",
-        }
-        cpu_ep = 'CPUExecutionProvider'
-        cpu_provider_options = {
-            "arena_extend_strategy": "kSameAsRequested",
-        }
-
-        EP_list = []
-        if device_id != "-1" and get_device() == 'GPU' \
-                and cuda_ep in get_available_providers():
-            EP_list = [(cuda_ep, cuda_provider_options)]
-        EP_list.append((cpu_ep, cpu_provider_options))
-
-        self._verify_model(model_file)
-        self.session = InferenceSession(model_file,
-                                        sess_options=sess_opt,
-                                        providers=EP_list)
-
-        if device_id != "-1" and cuda_ep not in self.session.get_providers():
-            warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
-                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
-                          'you can check their relations from the offical web site: '
-                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
-                          RuntimeWarning)
-
-    def __call__(self,
-                 input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
-        input_dict = dict(zip(self.get_input_names(), input_content))
-        try:
-            return self.session.run(None, input_dict)
-        except Exception as e:
-            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e
-
-    def get_input_names(self, ):
-        return [v.name for v in self.session.get_inputs()]
-
-    def get_output_names(self,):
-        return [v.name for v in self.session.get_outputs()]
-
-    def get_character_list(self, key: str = 'character'):
-        return self.meta_dict[key].splitlines()
-
-    def have_key(self, key: str = 'character') -> bool:
-        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
-        if key in self.meta_dict.keys():
-            return True
-        return False
-
-    @staticmethod
-    def _verify_model(model_path):
-        model_path = Path(model_path)
-        if not model_path.exists():
-            raise FileNotFoundError(f'{model_path} does not exists.')
-        if not model_path.is_file():
-            raise FileExistsError(f'{model_path} is not a file.')
-
-
-def read_yaml(yaml_path: Union[str, Path]) -> Dict:
-    if not Path(yaml_path).exists():
-        raise FileExistsError(f'The {yaml_path} does not exist.')
-
-    with open(str(yaml_path), 'rb') as f:
-        data = yaml.load(f, Loader=yaml.Loader)
-    return data
-
-
-@functools.lru_cache()
-def get_logger(name='rapdi_paraformer'):
-    """Initialize and get a logger by name.
-    If the logger has not been initialized, this method will initialize the
-    logger by adding one or two handlers, otherwise the initialized logger will
-    be directly returned. During initialization, a StreamHandler will always be
-    added.
-    Args:
-        name (str): Logger name.
-    Returns:
-        logging.Logger: The expected logger.
-    """
-    logger = logging.getLogger(name)
-    if name in logger_initialized:
-        return logger
-
-    for logger_name in logger_initialized:
-        if name.startswith(logger_name):
-            return logger
-
-    formatter = logging.Formatter(
-        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
-        datefmt="%Y/%m/%d %H:%M:%S")
-
-    sh = logging.StreamHandler()
-    sh.setFormatter(formatter)
-    logger.addHandler(sh)
-    logger_initialized[name] = True
-    logger.propagate = False
-    return logger
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
deleted file mode 100644
index 58913bbd3..000000000
--- a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# -*- encoding: utf-8 -*-
-
-import os.path
-from pathlib import Path
-from typing import List, Union, Tuple
-
-import copy
-import librosa
-import numpy as np
-
-from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
-                          OrtInferSession, TokenIDConverter, get_logger,
-                          read_yaml)
-from .utils.postprocess_utils import sentence_postprocess
-from .utils.frontend import WavFrontend
-from .utils.timestamp_utils import time_stamp_lfr6_onnx
-from .utils.e2e_vad import E2EVadModel
-
-logging = get_logger()
-
-
-class Fsmn_vad():
-    def __init__(self, model_dir: Union[str, Path] = None,
-                 batch_size: int = 1,
-                 device_id: Union[str, int] = "-1",
-                 quantize: bool = False,
-                 intra_op_num_threads: int = 4,
-                 max_end_sil: int = 800,
-                 ):
-
-        if not Path(model_dir).exists():
-            raise FileNotFoundError(f'{model_dir} does not exist.')
-
-        model_file = os.path.join(model_dir, 'model.onnx')
-        if quantize:
-            model_file = os.path.join(model_dir, 'model_quant.onnx')
-        config_file = os.path.join(model_dir, 'vad.yaml')
-        cmvn_file = os.path.join(model_dir, 'vad.mvn')
-        config = read_yaml(config_file)
-
-        self.frontend = WavFrontend(
-            cmvn_file=cmvn_file,
-            **config['frontend_conf']
-        )
-        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
-        self.batch_size = batch_size
-        self.vad_scorer = E2EVadModel(**config)
-        self.max_end_sil = max_end_sil
-
-    def prepare_cache(self, in_cache: list = []):
-        if len(in_cache) > 0:
-            return in_cache
-
-        for i in range(4):
-            cache = np.random.rand(1, 128, 19, 1).astype(np.float32)
-            in_cache.append(cache)
-        return in_cache
-
-
-    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
-        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
-        waveform_nums = len(waveform_list)
-        is_final = kwargs.get('kwargs', False)
-
-        asr_res = []
-        for beg_idx in range(0, waveform_nums, self.batch_size):
-
-            end_idx = min(waveform_nums, beg_idx + self.batch_size)
-            waveform = waveform_list[beg_idx:end_idx]
-            feats, feats_len = self.extract_feat(waveform)
-            param_dict = kwargs.get('param_dict', dict())
-            in_cache = param_dict.get('cache', list())
-            in_cache = self.prepare_cache(in_cache)
-            try:
-
-                scores, out_caches = self.infer(feats, *in_cache)
-                param_dict['cache'] = out_caches
-                segments = self.vad_scorer(scores, waveform, is_final=is_final, max_end_sil=self.max_end_sil)
-
-            except ONNXRuntimeError:
-                # logging.warning(traceback.format_exc())
-                logging.warning("input wav is silence or noise")
-                segments = ''
-            asr_res.append(segments)
-            # else:
-            #     preds = self.decode(am_scores, valid_token_lens)
-            #
-            #     asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
-
-        return asr_res
-
-    def load_data(self,
-                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
-        def load_wav(path: str) -> np.ndarray:
-            waveform, _ = librosa.load(path, sr=fs)
-            return waveform
-
-        if isinstance(wav_content, np.ndarray):
-            return [wav_content]
-
-        if isinstance(wav_content, str):
-            return [load_wav(wav_content)]
-
-        if isinstance(wav_content, list):
-            return [load_wav(path) for path in wav_content]
-
-        raise TypeError(
-            f'The type of {wav_content} is not in [str, np.ndarray, list]')
-
-    def extract_feat(self,
-                     waveform_list: List[np.ndarray]
-                     ) -> Tuple[np.ndarray, np.ndarray]:
-        feats, feats_len = [], []
-        for waveform in waveform_list:
-            speech, _ = self.frontend.fbank(waveform)
-            feat, feat_len = self.frontend.lfr_cmvn(speech)
-            feats.append(feat)
-            feats_len.append(feat_len)
-
-        feats = self.pad_feats(feats, np.max(feats_len))
-        feats_len = np.array(feats_len).astype(np.int32)
-        return feats, feats_len
-
-    @staticmethod
-    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
-        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
-            pad_width = ((0, max_feat_len - cur_len), (0, 0))
-            return np.pad(feat, pad_width, 'constant', constant_values=0)
-
-        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
-        feats = np.array(feat_res).astype(np.float32)
-        return feats
-
-    def infer(self, feats: np.ndarray,
-              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
-        outputs = self.ort_infer([feats, feats_len])
-        return outputs
-
-    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
-        return [self.decode_one(am_score, token_num)
-                for am_score, token_num in zip(am_scores, token_nums)]
-
-    def decode_one(self,
-                   am_score: np.ndarray,
-                   valid_token_num: int) -> List[str]:
-        yseq = am_score.argmax(axis=-1)
-        score = am_score.max(axis=-1)
-        score = np.sum(score, axis=-1)
-
-        # pad with mask tokens to ensure compatibility with sos/eos tokens
-        # asr_model.sos:1 asr_model.eos:2
-        yseq = np.array([1] + yseq.tolist() + [2])
-        hyp = Hypothesis(yseq=yseq, score=score)
-
-        # remove sos/eos and get results
-        last_pos = -1
-        token_int = hyp.yseq[1:last_pos].tolist()
-
-        # remove blank symbol id, which is assumed to be 0
-        token_int = list(filter(lambda x: x not in (0, 2), token_int))
-
-        # Change integer-ids to tokens
-        token = self.converter.ids2tokens(token_int)
-        token = token[:valid_token_num - self.pred_bias]
-        # texts = sentence_postprocess(token)
-        return token
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg
deleted file mode 100644
index b24107b40..000000000
Binary files a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg and /dev/null differ
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg
deleted file mode 100644
index a7ccaf5d8..000000000
Binary files a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg and /dev/null differ
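Context for reviewers: everything this diff removes is a stale setuptools artifact (build/lib copies and dist/*.egg files) that had been checked into the repository; only those generated copies are touched here. For reference, below is a minimal usage sketch of the funasr_onnx API that these copies shadowed, reconstructed from the deleted sources above; the model directory paths are hypothetical placeholders. Per the constructors, the ASR model_dir must contain model.onnx, config.yaml and am.mvn, and the VAD model_dir model.onnx, vad.yaml and vad.mvn (model_quant.onnx is loaded instead when quantize=True).

from funasr_onnx import Paraformer, Fsmn_vad

# Offline Paraformer ASR: __call__ accepts a wav path, a waveform ndarray,
# or a list of paths, and returns a list of {'preds': ...} result dicts.
asr = Paraformer("/path/to/paraformer_model_dir", batch_size=1, quantize=False)
print(asr("/path/to/asr_example.wav"))

# FSMN VAD: returns [start_ms, end_ms] speech segments per utterance
# (an end of -1 marks a segment still open in streaming use).
vad = Fsmn_vad("/path/to/fsmn_vad_model_dir", max_end_sil=800)
print(vad("/path/to/asr_example.wav"))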