diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt index 4ffe0f3cc..6feef9288 100644 --- a/funasr/runtime/onnxruntime/CMakeLists.txt +++ b/funasr/runtime/onnxruntime/CMakeLists.txt @@ -2,24 +2,27 @@ cmake_minimum_required(VERSION 3.10) project(FunASRonnx) -set(CMAKE_CXX_STANDARD 11) +# set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.") set(CMAKE_POSITION_INDEPENDENT_CODE ON) +include(TestBigEndian) +test_big_endian(BIG_ENDIAN) +if(BIG_ENDIAN) + message("Big endian system") +else() + message("Little endian system") +endif() + # for onnxruntime - IF(WIN32) - - if(CMAKE_CL_64) link_directories(${ONNXRUNTIME_DIR}\\lib) else() add_definitions(-D_WIN_X86) endif() ELSE() - - -link_directories(${ONNXRUNTIME_DIR}/lib) - + link_directories(${ONNXRUNTIME_DIR}/lib) endif() add_subdirectory("./third_party/yaml-cpp") diff --git a/funasr/runtime/onnxruntime/include/Audio.h b/funasr/runtime/onnxruntime/include/Audio.h index da5e82cc7..ec49a9f82 100644 --- a/funasr/runtime/onnxruntime/include/Audio.h +++ b/funasr/runtime/onnxruntime/include/Audio.h @@ -6,6 +6,13 @@ #include #include +#ifndef model_sample_rate +#define model_sample_rate 16000 +#endif +#ifndef WAV_HEADER_SIZE +#define WAV_HEADER_SIZE 44 +#endif + using namespace std; class AudioFrame { @@ -32,7 +39,6 @@ class Audio { int16_t *speech_buff; int speech_len; int speech_align_len; - int16_t sample_rate; int offset; float align_size; int data_type; @@ -43,10 +49,11 @@ class Audio { Audio(int data_type, int size); ~Audio(); void disp(); - bool loadwav(const char* filename); - bool loadwav(const char* buf, int nLen); - bool loadpcmwav(const char* buf, int nFileLen); - bool loadpcmwav(const char* filename); + bool loadwav(const char* filename, int32_t* sampling_rate); + void wavResample(int32_t sampling_rate, const float *waveform, int32_t n); + bool loadwav(const char* buf, int nLen, int32_t* sampling_rate); + bool loadpcmwav(const 
char* buf, int nFileLen, int32_t* sampling_rate); + bool loadpcmwav(const char* filename, int32_t* sampling_rate); int fetch_chunck(float *&dout, int len); int fetch(float *&dout, int &len, int &flag); void padding(); diff --git a/funasr/runtime/onnxruntime/include/libfunasrapi.h b/funasr/runtime/onnxruntime/include/libfunasrapi.h index 6e81fa995..9bc37e762 100644 --- a/funasr/runtime/onnxruntime/include/libfunasrapi.h +++ b/funasr/runtime/onnxruntime/include/libfunasrapi.h @@ -55,9 +55,9 @@ _FUNASRAPI FUNASR_HANDLE FunASRInit(const char* szModelDir, int nThread, bool q // if not give a fnCallback ,it should be NULL _FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback); -_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback); +_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback); -_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback); +_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback); _FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* szWavfile, FUNASR_MODE Mode, QM_CALLBACK fnCallback); diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp index bce3a9006..38b6de84f 100644 --- a/funasr/runtime/onnxruntime/src/Audio.cpp +++ b/funasr/runtime/onnxruntime/src/Audio.cpp @@ -3,11 +3,96 @@ #include #include #include +#include +#include #include "Audio.h" +#include "precomp.h" using namespace std; +// see http://soundfile.sapp.org/doc/WaveFormat/ +// Note: We assume little endian here +struct WaveHeader { + bool Validate() const { + // F F I R + if (chunk_id != 
0x46464952) { + printf("Expected chunk_id RIFF. Given: 0x%08x\n", chunk_id); + return false; + } + // E V A W + if (format != 0x45564157) { + printf("Expected format WAVE. Given: 0x%08x\n", format); + return false; + } + + if (subchunk1_id != 0x20746d66) { + printf("Expected subchunk1_id 0x20746d66. Given: 0x%08x\n", + subchunk1_id); + return false; + } + + if (subchunk1_size != 16) { // 16 for PCM + printf("Expected subchunk1_size 16. Given: %d\n", + subchunk1_size); + return false; + } + + if (audio_format != 1) { // 1 for PCM + printf("Expected audio_format 1. Given: %d\n", audio_format); + return false; + } + + if (num_channels != 1) { // we support only single channel for now + printf("Expected single channel. Given: %d\n", num_channels); + return false; + } + if (byte_rate != (sample_rate * num_channels * bits_per_sample / 8)) { + return false; + } + + if (block_align != (num_channels * bits_per_sample / 8)) { + return false; + } + + if (bits_per_sample != 16) { // we support only 16 bits per sample + printf("Expected bits_per_sample 16. 
Given: %d\n", + bits_per_sample); + return false; + } + return true; + } + + // See https://en.wikipedia.org/wiki/WAV#Metadata and + // https://www.robotplanet.dk/audio/wav_meta_data/riff_mci.pdf + void SeekToDataChunk(std::istream &is) { + // a t a d + while (is && subchunk2_id != 0x61746164) { + // const char *p = reinterpret_cast(&subchunk2_id); + // printf("Skip chunk (%x): %c%c%c%c of size: %d\n", subchunk2_id, p[0], + // p[1], p[2], p[3], subchunk2_size); + is.seekg(subchunk2_size, std::istream::cur); + is.read(reinterpret_cast(&subchunk2_id), sizeof(int32_t)); + is.read(reinterpret_cast(&subchunk2_size), sizeof(int32_t)); + } + } + + int32_t chunk_id; + int32_t chunk_size; + int32_t format; + int32_t subchunk1_id; + int32_t subchunk1_size; + int16_t audio_format; + int16_t num_channels; + int32_t sample_rate; + int32_t byte_rate; + int16_t block_align; + int16_t bits_per_sample; + int32_t subchunk2_id; // a tag of this chunk + int32_t subchunk2_size; // size of subchunk2 +}; +static_assert(sizeof(WaveHeader) == WAV_HEADER_SIZE, ""); + class AudioWindow { private: int *window; @@ -56,7 +141,7 @@ int AudioFrame::set_end(int val, int max_len) float frame_length = 400; float frame_shift = 160; float num_new_samples = - ceil((num_samples - 400) / frame_shift) * frame_shift + frame_length; + ceil((num_samples - frame_length) / frame_shift) * frame_shift + frame_length; end = start + num_new_samples; len = (int)num_new_samples; @@ -111,120 +196,150 @@ Audio::~Audio() void Audio::disp() { - printf("Audio time is %f s. len is %d\n", (float)speech_len / 16000, + printf("Audio time is %f s. 
len is %d\n", (float)speech_len / model_sample_rate, speech_len); } float Audio::get_time_len() { - return (float)speech_len / 16000; - //speech_len); + return (float)speech_len / model_sample_rate; } -bool Audio::loadwav(const char *filename) +void Audio::wavResample(int32_t sampling_rate, const float *waveform, + int32_t n) { + printf( + "Creating a resampler:\n" + " in_sample_rate: %d\n" + " output_sample_rate: %d\n", + sampling_rate, static_cast(model_sample_rate)); + float min_freq = + std::min(sampling_rate, model_sample_rate); + float lowpass_cutoff = 0.99 * 0.5 * min_freq; + int32_t lowpass_filter_width = 6; + //FIXME + //auto resampler = new LinearResample( + // sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); + auto resampler = std::make_unique( + sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); + std::vector samples; + resampler->Resample(waveform, n, true, &samples); + //reset speech_data + speech_len = samples.size(); + if (speech_data != NULL) { + free(speech_data); + } + speech_data = (float*)malloc(sizeof(float) * speech_len); + memset(speech_data, 0, sizeof(float) * speech_len); + copy(samples.begin(), samples.end(), speech_data); +} + +bool Audio::loadwav(const char *filename, int32_t* sampling_rate) +{ + WaveHeader header; if (speech_data != NULL) { free(speech_data); } if (speech_buff != NULL) { free(speech_buff); } - + offset = 0; - - FILE *fp; - fp = fopen(filename, "rb"); - if (fp == nullptr) + std::ifstream is(filename, std::ifstream::binary); + is.read(reinterpret_cast(&header), sizeof(header)); + if(!is){ + fprintf(stderr, "Failed to read %s\n", filename); return false; - fseek(fp, 0, SEEK_END); /*定位到文件末尾*/ - uint32_t nFileLen = ftell(fp); /*得到文件大小*/ - fseek(fp, 44, SEEK_SET); /*跳过wav文件头*/ - - speech_len = (nFileLen - 44) / 2; - speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); - speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_align_len); + } + + 
*sampling_rate = header.sample_rate; + // header.subchunk2_size contains the number of bytes in the data. + // As we assume each sample contains two bytes, so it is divided by 2 here + speech_len = header.subchunk2_size / 2; + speech_buff = (int16_t *)malloc(sizeof(int16_t) * speech_len); if (speech_buff) { - memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); - int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp); - fclose(fp); + memset(speech_buff, 0, sizeof(int16_t) * speech_len); + is.read(reinterpret_cast(speech_buff), header.subchunk2_size); + if (!is) { + fprintf(stderr, "Failed to read %s\n", filename); + return false; + } + speech_data = (float*)malloc(sizeof(float) * speech_len); + memset(speech_data, 0, sizeof(float) * speech_len); - speech_data = (float*)malloc(sizeof(float) * speech_align_len); - memset(speech_data, 0, sizeof(float) * speech_align_len); - int i; float scale = 1; - if (data_type == 1) { scale = 32768; } - - for (i = 0; i < speech_len; i++) { + for (int32_t i = 0; i != speech_len; ++i) { speech_data[i] = (float)speech_buff[i] / scale; } + //resample + if(*sampling_rate != model_sample_rate){ + wavResample(*sampling_rate, speech_data, speech_len); + } + AudioFrame* frame = new AudioFrame(speech_len); frame_queue.push(frame); - return true; } else return false; } - -bool Audio::loadwav(const char* buf, int nFileLen) +bool Audio::loadwav(const char* buf, int nFileLen, int32_t* sampling_rate) { - - - + WaveHeader header; if (speech_data != NULL) { free(speech_data); } if (speech_buff != NULL) { free(speech_buff); } - offset = 0; - size_t nOffset = 0; + std::memcpy(&header, buf, sizeof(header)); -#define WAV_HEADER_SIZE 44 - - speech_len = (nFileLen - WAV_HEADER_SIZE) / 2; - speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); - speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); + *sampling_rate = header.sample_rate; + speech_len = header.subchunk2_size / 2; + speech_buff = (int16_t 
*)malloc(sizeof(int16_t) * speech_len); if (speech_buff) { - memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); + memset(speech_buff, 0, sizeof(int16_t) * speech_len); memcpy((void*)speech_buff, (const void*)(buf + WAV_HEADER_SIZE), speech_len * sizeof(int16_t)); + speech_data = (float*)malloc(sizeof(float) * speech_len); + memset(speech_data, 0, sizeof(float) * speech_len); - speech_data = (float*)malloc(sizeof(float) * speech_align_len); - memset(speech_data, 0, sizeof(float) * speech_align_len); - int i; float scale = 1; - if (data_type == 1) { scale = 32768; } - for (i = 0; i < speech_len; i++) { + for (int32_t i = 0; i != speech_len; ++i) { speech_data[i] = (float)speech_buff[i] / scale; } + + //resample + if(*sampling_rate != model_sample_rate){ + wavResample(*sampling_rate, speech_data, speech_len); + } + AudioFrame* frame = new AudioFrame(speech_len); + frame_queue.push(frame); return true; } else return false; - } - -bool Audio::loadpcmwav(const char* buf, int nBufLen) +bool Audio::loadpcmwav(const char* buf, int nBufLen, int32_t* sampling_rate) { if (speech_data != NULL) { free(speech_data); @@ -234,33 +349,29 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen) } offset = 0; - size_t nOffset = 0; - - - speech_len = nBufLen / 2; - speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); - speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); + speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); if (speech_buff) { - memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); + memset(speech_buff, 0, sizeof(int16_t) * speech_len); memcpy((void*)speech_buff, (const void*)buf, speech_len * sizeof(int16_t)); + speech_data = (float*)malloc(sizeof(float) * speech_len); + memset(speech_data, 0, sizeof(float) * speech_len); - speech_data = (float*)malloc(sizeof(float) * speech_align_len); - memset(speech_data, 0, sizeof(float) * speech_align_len); - - - int i; float scale = 1; - if (data_type == 1) { scale = 
32768; } - for (i = 0; i < speech_len; i++) { + for (int32_t i = 0; i != speech_len; ++i) { speech_data[i] = (float)speech_buff[i] / scale; } + + //resample + if(*sampling_rate != model_sample_rate){ + wavResample(*sampling_rate, speech_data, speech_len); + } AudioFrame* frame = new AudioFrame(speech_len); frame_queue.push(frame); @@ -269,13 +380,10 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen) } else return false; - - } -bool Audio::loadpcmwav(const char* filename) +bool Audio::loadpcmwav(const char* filename, int32_t* sampling_rate) { - if (speech_data != NULL) { free(speech_data); } @@ -293,34 +401,31 @@ bool Audio::loadpcmwav(const char* filename) fseek(fp, 0, SEEK_SET); speech_len = (nFileLen) / 2; - speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size); - speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_align_len); + speech_buff = (int16_t*)malloc(sizeof(int16_t) * speech_len); if (speech_buff) { - memset(speech_buff, 0, sizeof(int16_t) * speech_align_len); + memset(speech_buff, 0, sizeof(int16_t) * speech_len); int ret = fread(speech_buff, sizeof(int16_t), speech_len, fp); fclose(fp); - speech_data = (float*)malloc(sizeof(float) * speech_align_len); - memset(speech_data, 0, sizeof(float) * speech_align_len); + speech_data = (float*)malloc(sizeof(float) * speech_len); + memset(speech_data, 0, sizeof(float) * speech_len); - - - int i; float scale = 1; - if (data_type == 1) { scale = 32768; } - - for (i = 0; i < speech_len; i++) { + for (int32_t i = 0; i != speech_len; ++i) { speech_data[i] = (float)speech_buff[i] / scale; } + //resample + if(*sampling_rate != model_sample_rate){ + wavResample(*sampling_rate, speech_data, speech_len); + } AudioFrame* frame = new AudioFrame(speech_len); frame_queue.push(frame); - return true; } @@ -329,7 +434,6 @@ bool Audio::loadpcmwav(const char* filename) } - int Audio::fetch_chunck(float *&dout, int len) { if (offset >= speech_align_len) { diff --git 
a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt index c07aac518..d41fcd0e1 100644 --- a/funasr/runtime/onnxruntime/src/CMakeLists.txt +++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt @@ -1,5 +1,6 @@ file(GLOB files1 "*.cpp") +file(GLOB files2 "*.cc") file(GLOB files4 "paraformer/*.cpp") set(files ${files1} ${files2} ${files3} ${files4}) diff --git a/funasr/runtime/onnxruntime/src/Vocab.cpp b/funasr/runtime/onnxruntime/src/Vocab.cpp index af6312bc9..b54a6c675 100644 --- a/funasr/runtime/onnxruntime/src/Vocab.cpp +++ b/funasr/runtime/onnxruntime/src/Vocab.cpp @@ -13,21 +13,6 @@ Vocab::Vocab(const char *filename) { ifstream in(filename); loadVocabFromYaml(filename); - - /* - string line; - if (in) // 有该文件 - { - while (getline(in, line)) // line中不包括每行的换行符 - { - vocab.push_back(line); - } - } - else{ - printf("Cannot load vocab from: %s, there must be file vocab.txt", filename); - exit(-1); - } - */ } Vocab::~Vocab() { diff --git a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp b/funasr/runtime/onnxruntime/src/libfunasrapi.cpp index 0d77d20d7..a2ecf1081 100644 --- a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp +++ b/funasr/runtime/onnxruntime/src/libfunasrapi.cpp @@ -17,8 +17,9 @@ extern "C" { if (!pRecogObj) return nullptr; + int32_t sampling_rate = -1; Audio audio(1); - if (!audio.loadwav(szBuf, nLen)) + if (!audio.loadwav(szBuf, nLen, &sampling_rate)) return nullptr; //audio.split(); @@ -41,14 +42,14 @@ extern "C" { return pResult; } - _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, FUNASR_MODE Mode, QM_CALLBACK fnCallback) + _FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* szBuf, int nLen, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback) { Model* pRecogObj = (Model*)handle; if (!pRecogObj) return nullptr; Audio audio(1); - if (!audio.loadpcmwav(szBuf, nLen)) + if (!audio.loadpcmwav(szBuf, nLen, &sampling_rate)) return 
nullptr; //audio.split(); @@ -71,14 +72,14 @@ extern "C" { return pResult; } - _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, FUNASR_MODE Mode, QM_CALLBACK fnCallback) + _FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* szFileName, int sampling_rate, FUNASR_MODE Mode, QM_CALLBACK fnCallback) { Model* pRecogObj = (Model*)handle; if (!pRecogObj) return nullptr; Audio audio(1); - if (!audio.loadpcmwav(szFileName)) + if (!audio.loadpcmwav(szFileName, &sampling_rate)) return nullptr; //audio.split(); @@ -106,9 +107,10 @@ extern "C" { Model* pRecogObj = (Model*)handle; if (!pRecogObj) return nullptr; - + + int32_t sampling_rate = -1; Audio audio(1); - if(!audio.loadwav(szWavfile)) + if(!audio.loadwav(szWavfile, &sampling_rate)) return nullptr; //audio.split(); diff --git a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp index 678cdf676..0d9c65809 100644 --- a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp +++ b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp @@ -70,7 +70,6 @@ ModelImp::~ModelImp() void ModelImp::reset() { - printf("Not Imp!!!!!!\n"); } void ModelImp::apply_lfr(Tensor*& din) diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h index 678a3e444..3aeed142f 100644 --- a/funasr/runtime/onnxruntime/src/precomp.h +++ b/funasr/runtime/onnxruntime/src/precomp.h @@ -44,6 +44,7 @@ using namespace std; #include "FeatureQueue.h" #include "SpeechWrap.h" #include +#include "resample.h" #include "Model.h" #include "paraformer_onnx.h" #include "libfunasrapi.h" diff --git a/funasr/runtime/onnxruntime/src/resample.cc b/funasr/runtime/onnxruntime/src/resample.cc new file mode 100644 index 000000000..0238752c3 --- /dev/null +++ b/funasr/runtime/onnxruntime/src/resample.cc @@ -0,0 +1,305 @@ +/** + * Copyright 2013 Pegah Ghahremani + * 2014 IMSL, PKU-HKUST (author: Wei Shi) + * 2014 Yanqing Sun, Junjie 
Wang + * 2014 Johns Hopkins University (author: Daniel Povey) + * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// this file is copied and modified from +// kaldi/src/feat/resample.cc + +#include "resample.h" + +#include +#include +#include + +#include +#include + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 +#endif + +#ifndef M_PI +#define M_PI 3.1415926535897932384626433832795 +#endif + +template +I Gcd(I m, I n) { + // this function is copied from kaldi/src/base/kaldi-math.h + if (m == 0 || n == 0) { + if (m == 0 && n == 0) { // gcd not defined, as all integers are divisors. + fprintf(stderr, "Undefined GCD since m = 0, n = 0.\n"); + exit(-1); + } + return (m == 0 ? (n > 0 ? n : -n) : (m > 0 ? m : -m)); + // return absolute value of whichever is nonzero + } + // could use compile-time assertion + // but involves messing with complex template stuff. + static_assert(std::is_integral::value, ""); + while (1) { + m %= n; + if (m == 0) return (n > 0 ? n : -n); + n %= m; + if (n == 0) return (m > 0 ? m : -m); + } +} + +/// Returns the least common multiple of two integers. Will +/// crash unless the inputs are positive. 
+template +I Lcm(I m, I n) { + // This function is copied from kaldi/src/base/kaldi-math.h + assert(m > 0 && n > 0); + I gcd = Gcd(m, n); + return gcd * (m / gcd) * (n / gcd); +} + +static float DotProduct(const float *a, const float *b, int32_t n) { + float sum = 0; + for (int32_t i = 0; i != n; ++i) { + sum += a[i] * b[i]; + } + return sum; +} + +LinearResample::LinearResample(int32_t samp_rate_in_hz, + int32_t samp_rate_out_hz, float filter_cutoff_hz, + int32_t num_zeros) + : samp_rate_in_(samp_rate_in_hz), + samp_rate_out_(samp_rate_out_hz), + filter_cutoff_(filter_cutoff_hz), + num_zeros_(num_zeros) { + assert(samp_rate_in_hz > 0.0 && samp_rate_out_hz > 0.0 && + filter_cutoff_hz > 0.0 && filter_cutoff_hz * 2 <= samp_rate_in_hz && + filter_cutoff_hz * 2 <= samp_rate_out_hz && num_zeros > 0); + + // base_freq is the frequency of the repeating unit, which is the gcd + // of the input frequencies. + int32_t base_freq = Gcd(samp_rate_in_, samp_rate_out_); + input_samples_in_unit_ = samp_rate_in_ / base_freq; + output_samples_in_unit_ = samp_rate_out_ / base_freq; + + SetIndexesAndWeights(); + Reset(); +} + +void LinearResample::SetIndexesAndWeights() { + first_index_.resize(output_samples_in_unit_); + weights_.resize(output_samples_in_unit_); + + double window_width = num_zeros_ / (2.0 * filter_cutoff_); + + for (int32_t i = 0; i < output_samples_in_unit_; i++) { + double output_t = i / static_cast(samp_rate_out_); + double min_t = output_t - window_width, max_t = output_t + window_width; + // we do ceil on the min and floor on the max, because if we did it + // the other way around we would unnecessarily include indexes just + // outside the window, with zero coefficients. It's possible + // if the arguments to the ceil and floor expressions are integers + // (e.g. if filter_cutoff_ has an exact ratio with the sample rates), + // that we unnecessarily include something with a zero coefficient, + // but this is only a slight efficiency issue. 
+ int32_t min_input_index = ceil(min_t * samp_rate_in_), + max_input_index = floor(max_t * samp_rate_in_), + num_indices = max_input_index - min_input_index + 1; + first_index_[i] = min_input_index; + weights_[i].resize(num_indices); + for (int32_t j = 0; j < num_indices; j++) { + int32_t input_index = min_input_index + j; + double input_t = input_index / static_cast(samp_rate_in_), + delta_t = input_t - output_t; + // sign of delta_t doesn't matter. + weights_[i][j] = FilterFunc(delta_t) / samp_rate_in_; + } + } +} + +/** Here, t is a time in seconds representing an offset from + the center of the windowed filter function, and FilterFunc(t) + returns the windowed filter function, described + in the header as h(t) = f(t)g(t), evaluated at t. +*/ +float LinearResample::FilterFunc(float t) const { + float window, // raised-cosine (Hanning) window of half-width + // num_zeros_ / (2 * filter_cutoff_) + filter; // sinc filter function + if (fabs(t) < num_zeros_ / (2.0 * filter_cutoff_)) + window = 0.5 * (1 + cos(M_2PI * filter_cutoff_ / num_zeros_ * t)); + else + window = 0.0; // outside support of window function + if (t != 0) + filter = sin(M_2PI * filter_cutoff_ * t) / (M_PI * t); + else + filter = 2 * filter_cutoff_; // limit of the function at t = 0 + return filter * window; +} + +void LinearResample::Reset() { + input_sample_offset_ = 0; + output_sample_offset_ = 0; + input_remainder_.resize(0); +} + +void LinearResample::Resample(const float *input, int32_t input_dim, bool flush, + std::vector *output) { + int64_t tot_input_samp = input_sample_offset_ + input_dim, + tot_output_samp = GetNumOutputSamples(tot_input_samp, flush); + + assert(tot_output_samp >= output_sample_offset_); + + output->resize(tot_output_samp - output_sample_offset_); + + // samp_out is the index into the total output signal, not just the part + // of it we are producing here.
+ for (int64_t samp_out = output_sample_offset_; samp_out < tot_output_samp; + samp_out++) { + int64_t first_samp_in; + int32_t samp_out_wrapped; + GetIndexes(samp_out, &first_samp_in, &samp_out_wrapped); + const std::vector &weights = weights_[samp_out_wrapped]; + // first_input_index is the first index into "input" that we have a weight + // for. + int32_t first_input_index = + static_cast(first_samp_in - input_sample_offset_); + float this_output; + if (first_input_index >= 0 && + first_input_index + static_cast(weights.size()) <= input_dim) { + this_output = + DotProduct(input + first_input_index, weights.data(), weights.size()); + } else { // Handle edge cases. + this_output = 0.0; + for (int32_t i = 0; i < static_cast(weights.size()); i++) { + float weight = weights[i]; + int32_t input_index = first_input_index + i; + if (input_index < 0 && + static_cast(input_remainder_.size()) + input_index >= 0) { + this_output += + weight * input_remainder_[input_remainder_.size() + input_index]; + } else if (input_index >= 0 && input_index < input_dim) { + this_output += weight * input[input_index]; + } else if (input_index >= input_dim) { + // We're past the end of the input and are adding zero; should only + // happen if the user specified flush == true, or else we would not + // be trying to output this sample. + assert(flush); + } + } + } + int32_t output_index = + static_cast(samp_out - output_sample_offset_); + (*output)[output_index] = this_output; + } + + if (flush) { + Reset(); // Reset the internal state. + } else { + SetRemainder(input, input_dim); + input_sample_offset_ = tot_input_samp; + output_sample_offset_ = tot_output_samp; + } +} + +int64_t LinearResample::GetNumOutputSamples(int64_t input_num_samp, + bool flush) const { + // For exact computation, we measure time in "ticks" of 1.0 / tick_freq, + // where tick_freq is the least common multiple of samp_rate_in_ and + // samp_rate_out_. 
+ int32_t tick_freq = Lcm(samp_rate_in_, samp_rate_out_); + int32_t ticks_per_input_period = tick_freq / samp_rate_in_; + + // work out the number of ticks in the time interval + // [ 0, input_num_samp/samp_rate_in_ ). + int64_t interval_length_in_ticks = input_num_samp * ticks_per_input_period; + if (!flush) { + float window_width = num_zeros_ / (2.0 * filter_cutoff_); + // To count the window-width in ticks we take the floor. This + // is because we're looking for the largest integer num-out-samp + // that fits in the interval, which is open on the right, a reduction + // in interval length of less than a tick will never make a difference. + // For example, the largest integer in the interval [ 0, 2 ) and the + // largest integer in the interval [ 0, 2 - 0.9 ) are the same (both one). + // So when we're subtracting the window-width we can ignore the fractional + // part. + int32_t window_width_ticks = floor(window_width * tick_freq); + // The time-period of the output that we can sample gets reduced + // by the window-width (which is actually the distance from the + // center to the edge of the windowing function) if we're not + // "flushing the output". + interval_length_in_ticks -= window_width_ticks; + } + if (interval_length_in_ticks <= 0) return 0; + + int32_t ticks_per_output_period = tick_freq / samp_rate_out_; + // Get the last output-sample in the closed interval, i.e. replacing [ ) with + // [ ]. Note: integer division rounds down. See + // http://en.wikipedia.org/wiki/Interval_(mathematics) for an explanation of + // the notation. + int64_t last_output_samp = interval_length_in_ticks / ticks_per_output_period; + // We need the last output-sample in the open interval, so if it takes us to + // the end of the interval exactly, subtract one.
+ if (last_output_samp * ticks_per_output_period == interval_length_in_ticks) + last_output_samp--; + + // First output-sample index is zero, so the number of output samples + // is the last output-sample plus one. + int64_t num_output_samp = last_output_samp + 1; + return num_output_samp; +} + +// inline +void LinearResample::GetIndexes(int64_t samp_out, int64_t *first_samp_in, + int32_t *samp_out_wrapped) const { + // A unit is the smallest nonzero amount of time that is an exact + // multiple of the input and output sample periods. The unit index + // is the answer to "which numbered unit we are in". + int64_t unit_index = samp_out / output_samples_in_unit_; + // samp_out_wrapped is equal to samp_out % output_samples_in_unit_ + *samp_out_wrapped = + static_cast(samp_out - unit_index * output_samples_in_unit_); + *first_samp_in = + first_index_[*samp_out_wrapped] + unit_index * input_samples_in_unit_; +} + +void LinearResample::SetRemainder(const float *input, int32_t input_dim) { + std::vector old_remainder(input_remainder_); + // max_remainder_needed is the width of the filter from side to side, + // measured in input samples. you might think it should be half that, + // but you have to consider that you might be wanting to output samples + // that are "in the past" relative to the beginning of the latest + // input... anyway, storing more remainder than needed is not harmful. + int32_t max_remainder_needed = + ceil(samp_rate_in_ * num_zeros_ / filter_cutoff_); + input_remainder_.resize(max_remainder_needed); + for (int32_t index = -static_cast(input_remainder_.size()); + index < 0; index++) { + // we interpret "index" as an offset from the end of "input" and + // from the end of input_remainder_. 
+ int32_t input_index = index + input_dim; + if (input_index >= 0) { + input_remainder_[index + static_cast(input_remainder_.size())] = + input[input_index]; + } else if (input_index + static_cast(old_remainder.size()) >= 0) { + input_remainder_[index + static_cast(input_remainder_.size())] = + old_remainder[input_index + + static_cast(old_remainder.size())]; + // else leave it at zero. + } + } +} diff --git a/funasr/runtime/onnxruntime/src/resample.h b/funasr/runtime/onnxruntime/src/resample.h new file mode 100644 index 000000000..b9a283ab8 --- /dev/null +++ b/funasr/runtime/onnxruntime/src/resample.h @@ -0,0 +1,137 @@ +/** + * Copyright 2013 Pegah Ghahremani + * 2014 IMSL, PKU-HKUST (author: Wei Shi) + * 2014 Yanqing Sun, Junjie Wang + * 2014 Johns Hopkins University (author: Daniel Povey) + * Copyright 2023 Xiaomi Corporation (authors: Fangjun Kuang) + * + * See LICENSE for clarification regarding multiple authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// this file is copied and modified from +// kaldi/src/feat/resample.h + +#include +#include + + +/* + We require that the input and output sampling rate be specified as + integers, as this is an easy way to specify that their ratio be rational. +*/ + +class LinearResample { + public: + /// Constructor. We make the input and output sample rates integers, because + /// we are going to need to find a common divisor. This should just remind + /// you that they need to be integers. 
The filter cutoff needs to be less + /// than samp_rate_in_hz/2 and less than samp_rate_out_hz/2. num_zeros + /// controls the sharpness of the filter, more == sharper but less efficient. + /// We suggest around 4 to 10 for normal use. + LinearResample(int32_t samp_rate_in_hz, int32_t samp_rate_out_hz, + float filter_cutoff_hz, int32_t num_zeros); + + /// Calling the function Reset() resets the state of the object prior to + /// processing a new signal; it is only necessary if you have called + /// Resample(x, x_size, false, y) for some signal, leading to a remainder of + /// the signal being retained, but then abandon processing the signal before + /// calling Resample(x, x_size, true, y) for the last piece. Calling it + /// unnecessarily between signals will not do any harm. + void Reset(); + + /// This function does the resampling. If you call it with flush == true and + /// you have never called it with flush == false, it just resamples the input + /// signal (it resizes the output to a suitable number of samples). + /// + /// You can also use this function to process a signal a piece at a time. + /// Suppose you break it into piece1, piece2, ... pieceN. You can call + /// \code{.cc} + /// Resample(piece1, piece1_size, false, &output1); + /// Resample(piece2, piece2_size, false, &output2); + /// Resample(piece3, piece3_size, true, &output3); + /// \endcode + /// If you call it with flush == false, it won't output the last few samples + /// but will remember them, so that if you later give it a second piece of + /// the input signal it can process it correctly. + /// If your most recent call to the object was with flush == false, it will + /// have internal state; you can remove this by calling Reset(). + /// Empty input is acceptable.
+ void Resample(const float *input, int32_t input_dim, bool flush, + std::vector *output); + + //// Return the input and output sampling rates (for checks, for example) + int32_t GetInputSamplingRate() const { return samp_rate_in_; } + int32_t GetOutputSamplingRate() const { return samp_rate_out_; } + + private: + void SetIndexesAndWeights(); + + float FilterFunc(float) const; + + /// This function outputs the number of output samples we will output + /// for a signal with "input_num_samp" input samples. If flush == true, + /// we return the largest n such that + /// (n/samp_rate_out_) is in the interval [ 0, input_num_samp/samp_rate_in_ ), + /// and note that the interval is half-open. If flush == false, + /// define window_width as num_zeros / (2.0 * filter_cutoff_); + /// we return the largest n such that (n/samp_rate_out_) is in the interval + /// [ 0, input_num_samp/samp_rate_in_ - window_width ). + int64_t GetNumOutputSamples(int64_t input_num_samp, bool flush) const; + + /// Given an output-sample index, this function outputs to *first_samp_in the + /// first input-sample index that we have a weight on (may be negative), + /// and to *samp_out_wrapped the index into weights_ where we can get the + /// corresponding weights on the input. + inline void GetIndexes(int64_t samp_out, int64_t *first_samp_in, + int32_t *samp_out_wrapped) const; + + void SetRemainder(const float *input, int32_t input_dim); + + private: + // The following variables are provided by the user. 
+ int32_t samp_rate_in_; + int32_t samp_rate_out_; + float filter_cutoff_; + int32_t num_zeros_; + + int32_t input_samples_in_unit_; ///< The number of input samples in the + ///< smallest repeating unit: num_samp_in_ = + ///< samp_rate_in_hz / Gcd(samp_rate_in_hz, + ///< samp_rate_out_hz) + + int32_t output_samples_in_unit_; ///< The number of output samples in the + ///< smallest repeating unit: num_samp_out_ + ///< = samp_rate_out_hz / + ///< Gcd(samp_rate_in_hz, samp_rate_out_hz) + + /// The first input-sample index that we sum over, for this output-sample + /// index. May be negative; any truncation at the beginning is handled + /// separately. This is just for the first few output samples, but we can + /// extrapolate the correct input-sample index for arbitrary output samples. + std::vector first_index_; + + /// Weights on the input samples, for this output-sample index. + std::vector> weights_; + + // the following variables keep track of where we are in a particular signal, + // if it is being provided over multiple calls to Resample(). + + int64_t input_sample_offset_; ///< The number of input samples we have + ///< already received for this signal + ///< (including anything in remainder_) + int64_t output_sample_offset_; ///< The number of samples we have already + ///< output for this signal. + std::vector input_remainder_; ///< A small trailing part of the + ///< previously seen input signal. +};