diff --git a/CHANGELOG.md b/CHANGELOG.md index e9efd9a..055d239 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,11 @@ # Change log for esp-sr +## 0.8.0 +support ESP32S3 chip +add wakenet7 & update wakenet5 to support multi-channel detection +remove wakenet6 +add AFE pipeline for speech recognition + ## 0.7.0 add chinese tts update noise suppression v2 diff --git a/CMakeLists.txt b/CMakeLists.txt index b3b5964..a1770c4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ set(COMPONENT_ADD_INCLUDEDIRS speech_command_recognition/include acoustic_algorithm/include esp-tts/esp_tts_chinese/include + audio_front_end/include ) @@ -18,6 +19,7 @@ target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/wake_w target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/speech_command_recognition") target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/acoustic_algorithm") target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese") +target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/audio_front_end") IF (IDF_VER MATCHES "v4.") add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libmultinet.a" PRIV_REQUIRES esp-sr) @@ -28,7 +30,7 @@ ENDIF (IDF_VER MATCHES "v4.") if(IDF_TARGET STREQUAL "esp32") target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" wakenet - dl_lib + dl_lib_esp32 c_speech_features hilexin_wn3 hilexin_wn4 @@ -59,3 +61,15 @@ target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" voice_set_template_esp32s2 "-Wl,--end-group") endif() + + +if(IDF_TARGET STREQUAL "esp32s3beta") +target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" + wakenet + dl_lib_esp32s3 + c_speech_features + wakeword_model + customized_word_wn5 + esp_audio_front_end + "-Wl,--end-group") +endif() diff --git a/acoustic_algorithm/libesp_audio_processor.a b/acoustic_algorithm/libesp_audio_processor.a index f5d3fdc..8c20cc1 100644 Binary 
files a/acoustic_algorithm/libesp_audio_processor.a and b/acoustic_algorithm/libesp_audio_processor.a differ diff --git a/audio_front_end/component.mk b/audio_front_end/component.mk new file mode 100644 index 0000000..ef00b03 --- /dev/null +++ b/audio_front_end/component.mk @@ -0,0 +1,11 @@ +COMPONENT_ADD_INCLUDEDIRS := include + +COMPONENT_SRCDIRS := . + +LIB_FILES := $(shell ls $(COMPONENT_PATH)/lib*.a) + +LIBS := $(patsubst lib%.a,-l%,$(notdir $(LIB_FILES))) + +COMPONENT_ADD_LDFLAGS += -L$(COMPONENT_PATH)/ $(LIBS) + +ALL_LIB_FILES += $(LIB_FILES) diff --git a/audio_front_end/include/esp_afe_sr_iface.h b/audio_front_end/include/esp_afe_sr_iface.h new file mode 100644 index 0000000..9901a3a --- /dev/null +++ b/audio_front_end/include/esp_afe_sr_iface.h @@ -0,0 +1,150 @@ +#pragma once +#include "stdint.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" + +//AFE: Audio Front-End +//SR: Speech Recognition +//afe_sr/AFE_SR: the audio front-end for speech recognition + +//Opaque AFE_SR data container +typedef struct esp_afe_sr_data_t esp_afe_sr_data_t; + +//Set AFE_SR mode +typedef enum { + SR_MODE_LOW_COST = 0, //LOW_COST, low memory consumption and CPU loading + SR_MODE_MEDIUM = 1, //MEDIUM + SR_MODE_HIGH_PERF = 2, //HIGH_PERF +} afe_sr_mode_t; + +/** + * @brief Function to initialize an AFE_SR instance with a specified mode + * + * @param mode The mode of AFE_SR + * @param perferred_core The preferred core to be pinned. + * If the AFE_SR tasks cannot all run in real time on a single core, the other core is used as well. + * @returns Handle to the AFE_SR data + */ +typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_t)(afe_sr_mode_t mode, int perferred_core); + +/** + * @brief Get the number of samples per channel in each frame that need to be passed to the function + * + * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function + * can be used to query that amount. 
Note that the returned amount is in 16-bit samples, not in bytes. + * + * @param afe The AFE_SR object to query + * @return The number of samples per channel in one frame + */ +typedef int (*esp_afe_sr_iface_op_get_frame_chunksize_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Get the number of channels in the samples that need to be passed to the feed function + * + * @param afe The AFE_SR object to query + * @return The number of channels in the input data + */ +typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Get the sample rate of the samples to feed to the function + * + * @param afe The AFE_SR object to query + * @return The sample rate, in Hz + */ +typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Feed samples of an audio stream to the AFE_SR + * + * @warning The input data should be arranged in the format of [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH1_1, ..., CHN_1, ...]. + * The last channel is the reference signal or far-end signal. + * + * @param afe The AFE_SR object to query + * @param in The input microphone signal, only supports signed 16-bit @ 16 kHz. The frame size can be queried by the + * `get_frame_chunksize`. The channel number can be queried by `get_channel_num`. + * @return The size of input + */ +typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in); + +/** + * @brief Fetch enhanced samples of an audio stream from the AFE_SR + * + * @warning The output is single channel data, no matter how many channels the input is. + * + * @param afe The AFE_SR object to query + * @param out The output enhanced signal. The frame size can be queried by the `get_frame_chunksize`. + * @return The type of output, -1: noise, 0: speech, 1: wake word 1, 2: wake word 2, ... 
+ */ +typedef int (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe, int16_t* out); + +/** + * @brief Initialize wakenet and the wake word coefficients, or reset wakenet and the wake word coefficients + * when wakenet has already been initialized. + * + * @param afe The AFE_SR object to query + * @param wakenet The pointer to wakenet + * @param model_coeff The coefficients of the wake word model + * @return 0: fail, 1: success + */ +typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, + esp_wn_iface_t *wakenet, + const model_coeff_getter_t *model_coeff); + +/** + * @brief Disable wakenet model. + * + * @param afe The AFE_SR object to query + * @return 0: fail, 1: success + */ +typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Enable wakenet model. + * + * @param afe The AFE_SR object to query + * @return 0: fail, 1: success + */ +typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Disable AEC algorithm. + * + * @param afe The AFE_SR object to query + * @return 0: fail, 1: success + */ +typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Enable AEC algorithm. + * + * @param afe The AFE_SR object to query + * @return 0: fail, 1: success + */ +typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Destroy an AFE_SR instance + * + * @param afe AFE_SR object to destroy + */ +typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe); + + +/** + * This structure contains the functions used to do operations on an AFE_SR. 
+ */ +typedef struct { + esp_afe_sr_iface_op_create_t create; + esp_afe_sr_iface_op_feed_t feed; + esp_afe_sr_iface_op_fetch_t fetch; + esp_afe_sr_iface_op_get_frame_chunksize_t get_frame_chunksize; + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; + esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; + esp_afe_sr_iface_op_set_wakenet_t set_wakenet; + esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet; + esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet; + esp_afe_sr_iface_op_disable_aec_t disable_aec; + esp_afe_sr_iface_op_enable_aec_t enable_aec; + esp_afe_sr_iface_op_destroy_t destroy; +} esp_afe_sr_iface_t; diff --git a/audio_front_end/include/esp_afe_sr_models.h b/audio_front_end/include/esp_afe_sr_models.h new file mode 100644 index 0000000..5424134 --- /dev/null +++ b/audio_front_end/include/esp_afe_sr_models.h @@ -0,0 +1,6 @@ +#pragma once +#include "esp_afe_sr_iface.h" + +extern const esp_afe_sr_iface_t esp_afe_sr_2mic; +extern const esp_afe_sr_iface_t esp_afe_sr_1mic; + diff --git a/audio_front_end/libesp_audio_front_end.a b/audio_front_end/libesp_audio_front_end.a new file mode 100644 index 0000000..b04d8c6 Binary files /dev/null and b/audio_front_end/libesp_audio_front_end.a differ diff --git a/lib/libc_speech_features.a b/lib/libc_speech_features.a index 4e665a5..a182848 100644 Binary files a/lib/libc_speech_features.a and b/lib/libc_speech_features.a differ diff --git a/lib/libdl_lib.a b/lib/libdl_lib_esp32.a similarity index 100% rename from lib/libdl_lib.a rename to lib/libdl_lib_esp32.a diff --git a/lib/libdl_lib_esp32s3.a b/lib/libdl_lib_esp32s3.a new file mode 100644 index 0000000..a133339 Binary files /dev/null and b/lib/libdl_lib_esp32s3.a differ diff --git a/lib/libwakenet.a b/lib/libwakenet.a index c2e83b8..a75cf22 100644 Binary files a/lib/libwakenet.a and b/lib/libwakenet.a differ diff --git a/wake_word_engine/include/esp_wn_iface.h b/wake_word_engine/include/esp_wn_iface.h index 8cd9fcc..08c7494 100644 --- 
a/wake_word_engine/include/esp_wn_iface.h +++ b/wake_word_engine/include/esp_wn_iface.h @@ -10,7 +10,11 @@ typedef struct model_iface_data_t model_iface_data_t; //As a consequence also the false alarm rate goes up typedef enum { DET_MODE_90 = 0, //Normal, response accuracy rate about 90% - DET_MODE_95 //Aggressive, response accuracy rate about 95% + DET_MODE_95 = 1, //Aggressive, response accuracy rate about 95% + DET_MODE_2CH_90 = 2, + DET_MODE_2CH_95 = 3, + DET_MODE_3CH_90 = 4, + DET_MODE_3CH_95 = 5, } det_mode_t; typedef struct { @@ -27,18 +31,6 @@ typedef struct { */ typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode); -/** - * @brief Function type to initialze a model instance with a detection mode and specified wake word coefficient - * - * Warning: Just wakeNet6 support this function to select which core to run neural network. - * - * @param det_mode The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95 - * @param model_coeff The specified wake word model coefficient - * @param core Core to run neural network - * @returns Handle to the model data - */ -typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode, int core); - /** * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function * @@ -50,6 +42,17 @@ typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const mod */ typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model); +/** + * @brief Callback function type to fetch the number of channels in the samples that need to be passed to the detect function + * + * Every speech recognition model processes a certain number of input channels at the same time. This function + * can be used to query that number. Note that the returned value is a channel count, not a number of samples or bytes. 
+ * + * @param model The model object to query + * @return The number of channels in the samples fed to the detect function + */ +typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model); + /** * @brief Get the sample rate of the samples to feed to the detect function @@ -109,6 +112,23 @@ typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model, */ typedef int (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); +/** + * @brief Get the volume gain + * + * @param model The model object to query + * @param target_db The target dB to calculate volume gain + * @returns the volume gain + */ +typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db); + +/** + * @brief Get the triggered channel index. Channel index starts from zero + * + * @param model The model object to query + * @return The channel index + */ +typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model); + /** * @brief Destroy a speech recognition model * @@ -122,13 +142,15 @@ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model); */ typedef struct { esp_wn_iface_op_create_t create; - esp_wn_iface_op_create_pinned_to_core_t create_pinned_to_core; esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize; + esp_wn_iface_op_get_channel_num_t get_channel_num; esp_wn_iface_op_get_samp_rate_t get_samp_rate; esp_wn_iface_op_get_word_num_t get_word_num; esp_wn_iface_op_get_word_name_t get_word_name; esp_wn_iface_op_set_det_threshold_t set_det_threshold; esp_wn_iface_op_get_det_threshold_t get_det_threshold; + esp_wn_iface_op_get_triggered_channel_t get_triggered_channel; + esp_wn_iface_op_get_vol_gain_t get_vol_gain; esp_wn_iface_op_detect_t detect; esp_wn_iface_op_destroy_t destroy; } esp_wn_iface_t; diff --git a/wake_word_engine/include/esp_wn_models.h b/wake_word_engine/include/esp_wn_models.h index 26751ec..af2da79 100644 --- a/wake_word_engine/include/esp_wn_models.h +++ 
b/wake_word_engine/include/esp_wn_models.h @@ -4,25 +4,19 @@ //Contains declarations of all available speech recognion models. Pair this up with the right coefficients and you have a model that can recognize //a specific phrase or word. -extern const esp_wn_iface_t esp_sr_wakenet3_quantized; -extern const esp_wn_iface_t esp_sr_wakenet4_quantized; extern const esp_wn_iface_t esp_sr_wakenet5_quantized; -extern const esp_wn_iface_t esp_sr_wakenet5_float; -extern const esp_wn_iface_t esp_sr_wakenet6_quantized; +extern const esp_wn_iface_t esp_sr_wakenet7_quantized; +extern const esp_wn_iface_t esp_sr_wakenet7_quantized8; /* Configure network to use based on what's selected in menuconfig. */ -#if CONFIG_SR_MODEL_WN3_QUANT -#define WAKENET_MODEL esp_sr_wakenet3_quantized -#elif CONFIG_SR_MODEL_WN4_QUANT -#define WAKENET_MODEL esp_sr_wakenet4_quantized -#elif CONFIG_SR_MODEL_WN5_FLOAT -#define WAKENET_MODEL esp_sr_wakenet5_float -#elif CONFIG_SR_MODEL_WN5_QUANT +#if CONFIG_SR_MODEL_WN5_QUANT #define WAKENET_MODEL esp_sr_wakenet5_quantized -#elif CONFIG_SR_MODEL_WN6_QUANT -#define WAKENET_MODEL esp_sr_wakenet6_quantized +#elif CONFIG_SR_MODEL_WN7_QUANT +#define WAKENET_MODEL esp_sr_wakenet7_quantized +#elif CONFIG_SR_MODEL_WN7_QUANT8 +#define WAKENET_MODEL esp_sr_wakenet7_quantized8 #else #error No valid neural network model selected. #endif @@ -30,19 +24,7 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized; /* Configure wake word to use based on what's selected in menuconfig. 
*/ -#if CONFIG_SR_WN3_HILEXIN -#include "hilexin_wn3.h" -#define WAKENET_COEFF get_coeff_hilexin_wn3 - -#elif CONFIG_SR_WN4_HILEXIN -#include "hilexin_wn4.h" -#define WAKENET_COEFF get_coeff_hilexin_wn4 - -#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_FLOAT -#include "hilexin_wn5_float.h" -#define WAKENET_COEFF get_coeff_hilexin_wn5_float - -#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT +#if CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT #include "hilexin_wn5.h" #define WAKENET_COEFF get_coeff_hilexin_wn5 @@ -74,17 +56,37 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized; #include "hijeson_wn5X3.h" #define WAKENET_COEFF get_coeff_hijeson_wn5X3 -#elif CONFIG_SR_WN6_NIHAOXIAOXIN -#include "nihaoxiaoxin_wn6.h" -#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6 - #elif CONFIG_SR_WN5_CUSTOMIZED_WORD #include "customized_word_wn5.h" #define WAKENET_COEFF get_coeff_customized_word_wn5 -#elif CONFIG_SR_WN6_CUSTOMIZED_WORD -#include "customized_word_wn6.h" -#define WAKENET_COEFF get_coeff_customized_word_wn6 +#elif CONFIG_SR_WN7_CUSTOMIZED_WORD +#include "customized_word_wn7.h" +#define WAKENET_COEFF get_coeff_customized_word_wn7 + +#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT +#include "xiaoaitongxue_wn7.h" +#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7 + +#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT8 +#include "xiaoaitongxue_wn7_q8.h" +#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7_q8 + +#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT +#include "hilexin_wn7.h" +#define WAKENET_COEFF get_coeff_hilexin_wn7 + +#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT8 +#include "hilexin_wn7_q8.h" +#define WAKENET_COEFF get_coeff_hilexin_wn7_q8 + +#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT +#include "alexa_wn7.h" +#define WAKENET_COEFF get_coeff_alexa_wn7 + +#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT8 +#include "alexa_wn7_q8.h" +#define WAKENET_COEFF get_coeff_alexa_wn7_q8 #else #error 
No valid wake word selected. diff --git a/wake_word_engine/libwakeword_model.a b/wake_word_engine/libwakeword_model.a new file mode 100644 index 0000000..2948bf3 Binary files /dev/null and b/wake_word_engine/libwakeword_model.a differ