diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt index 794818654..0e913858d 100644 --- a/funasr/runtime/onnxruntime/CMakeLists.txt +++ b/funasr/runtime/onnxruntime/CMakeLists.txt @@ -30,4 +30,3 @@ include_directories(${PROJECT_SOURCE_DIR}/kaldi-native-fbank) add_subdirectory("./third_party/yaml-cpp") add_subdirectory(kaldi-native-fbank/kaldi-native-fbank/csrc) add_subdirectory(src) -add_subdirectory(tester) diff --git a/funasr/runtime/onnxruntime/include/Audio.h b/funasr/runtime/onnxruntime/include/Audio.h index c38c31a5f..2667c315d 100644 --- a/funasr/runtime/onnxruntime/include/Audio.h +++ b/funasr/runtime/onnxruntime/include/Audio.h @@ -2,14 +2,10 @@ #ifndef AUDIO_H #define AUDIO_H -#include #include #include #include "Model.h" -#ifndef model_sample_rate -#define model_sample_rate 16000 -#endif #ifndef WAV_HEADER_SIZE #define WAV_HEADER_SIZE 44 #endif diff --git a/funasr/runtime/onnxruntime/include/ComDefine.h b/funasr/runtime/onnxruntime/include/ComDefine.h index f131e5ec3..6929e497e 100644 --- a/funasr/runtime/onnxruntime/include/ComDefine.h +++ b/funasr/runtime/onnxruntime/include/ComDefine.h @@ -8,4 +8,21 @@ #define S_ALL 3 #define S_ERR 4 +#ifndef MODEL_SAMPLE_RATE +#define MODEL_SAMPLE_RATE 16000 +#endif + +#ifndef VAD_SILENCE_DYRATION +#define VAD_SILENCE_DYRATION 15000 +#endif + +#ifndef VAD_MAX_LEN +#define VAD_MAX_LEN 800 +#endif + +#ifndef VAD_SPEECH_NOISE_THRES +#define VAD_SPEECH_NOISE_THRES 0.9 +#endif + + #endif diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp index 72e90a285..5c10cf1ea 100644 --- a/funasr/runtime/onnxruntime/src/Audio.cpp +++ b/funasr/runtime/onnxruntime/src/Audio.cpp @@ -187,13 +187,13 @@ Audio::~Audio() void Audio::disp() { - printf("Audio time is %f s. len is %d\n", (float)speech_len / model_sample_rate, + printf("Audio time is %f s. len is %d\n", (float)speech_len / MODEL_SAMPLE_RATE, speech_len); } float Audio::get_time_len() { - return (float)speech_len / model_sample_rate; + return (float)speech_len / MODEL_SAMPLE_RATE; } void Audio::wavResample(int32_t sampling_rate, const float *waveform, @@ -203,9 +203,9 @@ void Audio::wavResample(int32_t sampling_rate, const float *waveform, "Creating a resampler:\n" " in_sample_rate: %d\n" " output_sample_rate: %d\n", - sampling_rate, static_cast(model_sample_rate)); + sampling_rate, static_cast(MODEL_SAMPLE_RATE)); float min_freq = - std::min(sampling_rate, model_sample_rate); + std::min(sampling_rate, MODEL_SAMPLE_RATE); float lowpass_cutoff = 0.99 * 0.5 * min_freq; int32_t lowpass_filter_width = 6; @@ -213,7 +213,7 @@ void Audio::wavResample(int32_t sampling_rate, const float *waveform, //auto resampler = new LinearResample( // sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); auto resampler = std::make_unique( - sampling_rate, model_sample_rate, lowpass_cutoff, lowpass_filter_width); + sampling_rate, MODEL_SAMPLE_RATE, lowpass_cutoff, lowpass_filter_width); std::vector samples; resampler->Resample(waveform, n, true, &samples); //reset speech_data @@ -270,7 +270,7 @@ bool Audio::loadwav(const char *filename, int32_t* sampling_rate) } //resample - if(*sampling_rate != model_sample_rate){ + if(*sampling_rate != MODEL_SAMPLE_RATE){ wavResample(*sampling_rate, speech_data, speech_len); } @@ -317,7 +317,7 @@ bool Audio::loadwav(const char* buf, int nFileLen, int32_t* sampling_rate) } //resample - if(*sampling_rate != model_sample_rate){ + if(*sampling_rate != MODEL_SAMPLE_RATE){ wavResample(*sampling_rate, speech_data, speech_len); } @@ -360,7 +360,7 @@ bool Audio::loadpcmwav(const char* buf, int nBufLen, int32_t* sampling_rate) } //resample - if(*sampling_rate != model_sample_rate){ + if(*sampling_rate != MODEL_SAMPLE_RATE){ wavResample(*sampling_rate, speech_data, speech_len); } @@ -411,7 +411,7 @@ bool Audio::loadpcmwav(const char* filename, int32_t* sampling_rate) } //resample - if(*sampling_rate != model_sample_rate){ + if(*sampling_rate != MODEL_SAMPLE_RATE){ wavResample(*sampling_rate, speech_data, speech_len); } @@ -511,7 +511,7 @@ void Audio::split(Model* pRecogObj) std::vector pcm_data(speech_data, speech_data+sp_len); vector> vad_segments = pRecogObj->vad_seg(pcm_data); - int seg_sample = model_sample_rate/1000; + int seg_sample = MODEL_SAMPLE_RATE/1000; for(vector segment:vad_segments) { frame = new AudioFrame(); diff --git a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt index 6798b9957..18ff9cde8 100644 --- a/funasr/runtime/onnxruntime/src/CMakeLists.txt +++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt @@ -1,25 +1,22 @@ file(GLOB files1 "*.cpp") file(GLOB files2 "*.cc") -file(GLOB files4 "paraformer/*.cpp") -set(files ${files1} ${files2} ${files3} ${files4}) - -# message("${files}") +set(files ${files1} ${files2}) +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin) add_library(funasr ${files}) if(WIN32) - - set(EXTRA_LIBS pthread yaml-cpp csrc) - if(CMAKE_CL_64) - target_link_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x64) - else() - target_link_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x86) - endif() - target_include_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/include ) - - target_compile_definitions(funasr PUBLIC -D_FUNASR_API_EXPORT) + set(EXTRA_LIBS pthread yaml-cpp csrc) + if(CMAKE_CL_64) + target_link_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x64) + else() + target_link_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/lib/x86) + endif() + target_include_directories(funasr PUBLIC ${CMAKE_SOURCE_DIR}/win/include ) + + target_compile_definitions(funasr PUBLIC -D_FUNASR_API_EXPORT) else() set(EXTRA_LIBS pthread yaml-cpp csrc) @@ -38,4 +35,8 @@ endif() include_directories(${CMAKE_SOURCE_DIR}/include) target_link_libraries(funasr PUBLIC onnxruntime ${EXTRA_LIBS}) +add_executable(funasr-onnx-offline "funasr-onnx-offline.cpp") +add_executable(funasr-onnx-offline-rtf "funasr-onnx-offline-rtf.cpp") +target_link_libraries(funasr-onnx-offline PUBLIC funasr) +target_link_libraries(funasr-onnx-offline-rtf PUBLIC funasr) diff --git a/funasr/runtime/onnxruntime/src/FeatureQueue.cpp b/funasr/runtime/onnxruntime/src/FeatureQueue.cpp deleted file mode 100644 index f07633b42..000000000 --- a/funasr/runtime/onnxruntime/src/FeatureQueue.cpp +++ /dev/null @@ -1,59 +0,0 @@ -#include "precomp.h" -FeatureQueue::FeatureQueue() -{ - buff = new Tensor(67, 80); - window_size = 67; - buff_idx = 0; -} - -FeatureQueue::~FeatureQueue() -{ - delete buff; -} - -void FeatureQueue::reinit(int size) -{ - delete buff; - buff = new Tensor(size, 80); - buff_idx = 0; - window_size = size; -} - -void FeatureQueue::reset() -{ - buff_idx = 0; -} - -void FeatureQueue::push(float *din, int flag) -{ - int offset = buff_idx * 80; - memcpy(buff->buff + offset, din, 80 * sizeof(float)); - buff_idx++; - - if (flag == S_END) { - Tensor *tmp = new Tensor(buff_idx, 80); - memcpy(tmp->buff, buff->buff, buff_idx * 80 * sizeof(float)); - feature_queue.push(tmp); - buff_idx = 0; - } else if (buff_idx == window_size) { - feature_queue.push(buff); - Tensor *tmp = new Tensor(window_size, 80); - memcpy(tmp->buff, buff->buff + (window_size - 3) * 80, - 3 * 80 * sizeof(float)); - buff_idx = 3; - buff = tmp; - } -} - -Tensor *FeatureQueue::pop() -{ - - Tensor *tmp = feature_queue.front(); - feature_queue.pop(); - return tmp; -} - -int FeatureQueue::size() -{ - return feature_queue.size(); -} diff --git a/funasr/runtime/onnxruntime/src/FeatureQueue.h b/funasr/runtime/onnxruntime/src/FeatureQueue.h deleted file mode 100644 index be3360b49..000000000 --- a/funasr/runtime/onnxruntime/src/FeatureQueue.h +++ /dev/null @@ -1,28 +0,0 @@ - -#ifndef FEATUREQUEUE_H -#define FEATUREQUEUE_H - -#include "Tensor.h" -#include -#include -using namespace std; - - -class FeatureQueue { - private: - queue *> feature_queue; - Tensor *buff; - int buff_idx; - int window_size; - - public: - FeatureQueue(); - ~FeatureQueue(); - void reinit(int size); - void reset(); - void push(float *din, int flag); - Tensor *pop(); - int size(); -}; - -#endif diff --git a/funasr/runtime/onnxruntime/src/SpeechWrap.cpp b/funasr/runtime/onnxruntime/src/SpeechWrap.cpp deleted file mode 100644 index 60d0a2b70..000000000 --- a/funasr/runtime/onnxruntime/src/SpeechWrap.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "precomp.h" - -SpeechWrap::SpeechWrap() -{ - cache_size = 0; -} - -SpeechWrap::~SpeechWrap() -{ -} - -void SpeechWrap::reset() -{ - cache_size = 0; -} - -void SpeechWrap::load(float *din, int len) -{ - in = din; - in_size = len; - total_size = cache_size + in_size; -} - -int SpeechWrap::size() -{ - return total_size; -} - -void SpeechWrap::update(int offset) -{ - int in_offset = offset - cache_size; - cache_size = (total_size - offset); - memcpy(cache, in + in_offset, cache_size * sizeof(float)); -} - -float &SpeechWrap::operator[](int i) -{ - return i < cache_size ? cache[i] : in[i - cache_size]; -} diff --git a/funasr/runtime/onnxruntime/src/SpeechWrap.h b/funasr/runtime/onnxruntime/src/SpeechWrap.h deleted file mode 100644 index 5d3ee4087..000000000 --- a/funasr/runtime/onnxruntime/src/SpeechWrap.h +++ /dev/null @@ -1,26 +0,0 @@ - -#ifndef SPEECHWRAP_H -#define SPEECHWRAP_H - -#include - -class SpeechWrap { - private: - float cache[400]; - int cache_size; - float *in; - int in_size; - int total_size; - int next_cache_size; - - public: - SpeechWrap(); - ~SpeechWrap(); - void load(float *din, int len); - void update(int offset); - void reset(); - int size(); - float &operator[](int i); -}; - -#endif diff --git a/funasr/runtime/onnxruntime/src/commonfunc.h b/funasr/runtime/onnxruntime/src/commonfunc.h index 5198030c0..8d1a97c94 100644 --- a/funasr/runtime/onnxruntime/src/commonfunc.h +++ b/funasr/runtime/onnxruntime/src/commonfunc.h @@ -1,6 +1,5 @@ #pragma once - typedef struct { std::string msg; @@ -11,8 +10,6 @@ typedef struct #ifdef _WIN32 #include - - inline std::wstring string2wstring(const std::string& str, const std::string& locale) { typedef std::codecvt_byname F; @@ -29,8 +26,6 @@ inline std::wstring strToWstr(std::string str) { #endif - - inline void getInputName(Ort::Session* session, string& inputName,int nIndex=0) { size_t numInputNodes = session->GetInputCount(); if (numInputNodes > 0) { diff --git a/funasr/runtime/onnxruntime/tester/tester_rtf.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp similarity index 100% rename from funasr/runtime/onnxruntime/tester/tester_rtf.cpp rename to funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp similarity index 96% rename from funasr/runtime/onnxruntime/tester/tester.cpp rename to funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp index 4cb38dffc..fae739b2b 100644 --- a/funasr/runtime/onnxruntime/tester/tester.cpp +++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp @@ -6,9 +6,6 @@ #endif #include "libfunasrapi.h" - -#include -#include #include using namespace std; @@ -41,12 +38,10 @@ int main(int argc, char *argv[]) printf("Model initialization takes %lfs.\n", (double)modle_init_micros / 1000000); gettimeofday(&start, NULL); - float snippet_time = 0.0f; - FUNASR_RESULT Result=FunASRRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL, use_vad); - gettimeofday(&end, NULL); - + + float snippet_time = 0.0f; if (Result) { string msg = FunASRGetResult(Result, 0); @@ -57,7 +52,7 @@ int main(int argc, char *argv[]) } else { - cout <<"no return data!"; + printf("no return data!"); } printf("Audio length %lfs.\n", (double)snippet_time); diff --git a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp index 1e4a31058..1a86da665 100644 --- a/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp +++ b/funasr/runtime/onnxruntime/src/paraformer_onnx.cpp @@ -14,7 +14,7 @@ ModelImp::ModelImp(const char* path,int nNumThread, bool quantize, bool use_vad) string vad_path = pathAppend(path, "vad_model.onnx"); string mvn_path = pathAppend(path, "vad.mvn"); vadHandle = make_unique(); - vadHandle->init_vad(vad_path, mvn_path, model_sample_rate, 800, 15000, 0.9); + vadHandle->init_vad(vad_path, mvn_path, MODEL_SAMPLE_RATE, VAD_MAX_LEN, VAD_SILENCE_DYRATION, VAD_SPEECH_NOISE_THRES); } if(quantize) @@ -29,7 +29,7 @@ ModelImp::ModelImp(const char* path,int nNumThread, bool quantize, bool use_vad) // knf options fbank_opts.frame_opts.dither = 0; fbank_opts.mel_opts.num_bins = 80; - fbank_opts.frame_opts.samp_freq = model_sample_rate; + fbank_opts.frame_opts.samp_freq = MODEL_SAMPLE_RATE; fbank_opts.frame_opts.window_type = "hamming"; fbank_opts.frame_opts.frame_shift_ms = 10; fbank_opts.frame_opts.frame_length_ms = 25; @@ -191,7 +191,7 @@ string ModelImp::forward(float* din, int len, int flag) { int32_t in_feat_dim = fbank_opts.mel_opts.num_bins; - std::vector wav_feats = FbankKaldi(model_sample_rate, din, len); + std::vector wav_feats = FbankKaldi(MODEL_SAMPLE_RATE, din, len); wav_feats = ApplyLFR(wav_feats); ApplyCMVN(&wav_feats); diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h index 79225eeca..f936541d5 100644 --- a/funasr/runtime/onnxruntime/src/precomp.h +++ b/funasr/runtime/onnxruntime/src/precomp.h @@ -1,6 +1,5 @@ #pragma once // system - #include #include #include @@ -16,8 +15,6 @@ #include #include #include - - #include using namespace std; @@ -27,27 +24,19 @@ using namespace std; #include "kaldi-native-fbank/csrc/feature-fbank.h" #include "kaldi-native-fbank/csrc/online-feature.h" - // mine - +#include "ComDefine.h" #include "commonfunc.h" -#include #include "predefine_coe.h" #include "FsmnVad.h" - -#include -//#include "alignedmem.h" #include "Vocab.h" +#include "CommonStruct.h" +#include "Audio.h" #include "Tensor.h" #include "util.h" -#include "CommonStruct.h" -#include "FeatureQueue.h" -#include "SpeechWrap.h" -#include #include "resample.h" #include "Model.h" #include "paraformer_onnx.h" #include "libfunasrapi.h" - using namespace paraformer; diff --git a/funasr/runtime/onnxruntime/src/tmp.h b/funasr/runtime/onnxruntime/src/tmp.h deleted file mode 100644 index b57303f82..000000000 --- a/funasr/runtime/onnxruntime/src/tmp.h +++ /dev/null @@ -1,112 +0,0 @@ - -#ifndef WENETPARAMS_H -#define WENETPARAMS_H -// #pragma pack(1) - -#define vocab_size 5538 - -typedef struct { - float conv0_weight[512 * 9]; - float conv0_bias[512]; - - float conv1_weight[512 * 512 * 9]; - float conv1_bias[512]; - - float out0_weight[9728 * 512]; - float out0_bias[512]; - -} EncEmbedParams; - -typedef struct { - float linear_q_weight[512 * 512]; - float linear_q_bias[512]; - float linear_k_weight[512 * 512]; - float linear_k_bias[512]; - float linear_v_weight[512 * 512]; - float linear_v_bias[512]; - float linear_out_weight[512 * 512]; - float linear_out_bias[512]; -} SelfAttnParams; - -typedef struct { - SelfAttnParams linear0; - float linear_pos_weight[512 * 512]; - float pos_bias_u[512]; - float pos_bias_v[512]; - -} EncSelfAttnParams; - -typedef struct { - float w1_weight[512 * 2048]; - float w1_bias[2048]; - float w2_weight[2048 * 512]; - float w2_bias[512]; -} FeedForwardParams; - -typedef struct { - float weight[512]; - float bias[512]; -} NormParams; - -typedef struct { - float pointwise_conv1_weight[1024 * 512]; - float pointwise_conv1_bias[1024]; - - float depthwise_conv_weight[512 * 15]; - float depthwise_conv_bias[512]; - - float pointwise_conv2_weight[512 * 512]; - float pointwise_conv2_bias[512]; - NormParams norm; -} EncConvParams; - -typedef struct { - EncSelfAttnParams self_attn; - FeedForwardParams feedforward; - FeedForwardParams feedforward_macaron; - EncConvParams conv_module; - NormParams norm_ff; - NormParams norm_mha; - NormParams norm_macaron; - NormParams norm_conv; - NormParams norm_final; - // float concat_weight[1024 * 512]; - // float concat_bias[512]; -} SubEncoderParams; - -typedef struct { - EncEmbedParams embed; - SubEncoderParams sub_encoder[12]; - NormParams after_norm; -} EncoderParams; - -typedef struct { - SelfAttnParams self_attn; - SelfAttnParams src_attn; - FeedForwardParams feedward; - NormParams norm1; - NormParams norm2; - NormParams norm3; - // float concat_weight1[1024 * 512]; - // float concat_bias1[512]; - // float concat_weight2[1024 * 512]; - // float concat_bias2[512]; -} SubDecoderParams; - -typedef struct { - float embed_weight[vocab_size * 512]; - SubDecoderParams sub_decoder[6]; - NormParams after_norm; - float output_weight[vocab_size * 512]; - float output_bias[vocab_size]; -} DecoderParams; - -typedef struct { - EncoderParams encoder; - float ctc_weight[512 * vocab_size]; - float ctc_bias[vocab_size]; - DecoderParams decoder; -} WenetParams; - -// #pragma pack() -#endif diff --git a/funasr/runtime/onnxruntime/tester/CMakeLists.txt b/funasr/runtime/onnxruntime/tester/CMakeLists.txt deleted file mode 100644 index e3224e332..000000000 --- a/funasr/runtime/onnxruntime/tester/CMakeLists.txt +++ /dev/null @@ -1,23 +0,0 @@ - - -if(WIN32) - if(CMAKE_CL_64) - link_directories( ${CMAKE_SOURCE_DIR}/win/lib/x64 ) - else() - link_directories( ${CMAKE_SOURCE_DIR}/win/lib/x86 ) - endif() -endif() - -set(EXTRA_LIBS funasr) - - -include_directories(${CMAKE_SOURCE_DIR}/include) -set(EXECNAME "tester") -set(EXECNAMERTF "tester_rtf") - -add_executable(${EXECNAME} "tester.cpp") -target_link_libraries(${EXECNAME} PUBLIC ${EXTRA_LIBS}) - -add_executable(${EXECNAMERTF} "tester_rtf.cpp") -target_link_libraries(${EXECNAMERTF} PUBLIC ${EXTRA_LIBS}) -