feature/add AFE component & update wakenet

2025-09-15 15:28:44 +08:00 · 2021-02-25 11:01:29 +08:00 · 2021-02-25 11:01:29 +08:00 · d289680270
commit d289680270
parent c5896943ea
14 changed files with 259 additions and 48 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,11 @@
 # Change log for esp-sr

+## 0.8.0
+support ESP32S3 chip
+add wakenet7 & update wakenet5 to support multi-channel detection
+remove wakenet6
+add AFE pipeline for speech recognition 
+
 ## 0.7.0
 add chinese tts
 update noise suppression v2
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,6 +8,7 @@ set(COMPONENT_ADD_INCLUDEDIRS
    speech_command_recognition/include
    acoustic_algorithm/include
    esp-tts/esp_tts_chinese/include
+    audio_front_end/include
    )


@ -18,6 +19,7 @@ target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/wake_w
 target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/speech_command_recognition")
 target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/acoustic_algorithm")
 target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese")
+target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/audio_front_end")

 IF (IDF_VER MATCHES "v4.")
 add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libmultinet.a" PRIV_REQUIRES esp-sr) 
@ -28,7 +30,7 @@ ENDIF (IDF_VER MATCHES "v4.")
 if(IDF_TARGET STREQUAL "esp32")
 target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
    wakenet
-    dl_lib
+    dl_lib_esp32
    c_speech_features
    hilexin_wn3
    hilexin_wn4
@ -59,3 +61,15 @@ target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
    voice_set_template_esp32s2
     "-Wl,--end-group")
 endif()
+
+
+if(IDF_TARGET STREQUAL "esp32s3beta")
+target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
+    wakenet
+    dl_lib_esp32s3
+    c_speech_features
+    wakeword_model
+    customized_word_wn5
+    esp_audio_front_end
+    "-Wl,--end-group")
+endif()
--- a/acoustic_algorithm/libesp_audio_processor.a
+++ b/acoustic_algorithm/libesp_audio_processor.a
--- a/audio_front_end/component.mk
+++ b/audio_front_end/component.mk
@ -0,0 +1,11 @@
+COMPONENT_ADD_INCLUDEDIRS := include
+
+COMPONENT_SRCDIRS := .
+
+LIB_FILES := $(shell ls $(COMPONENT_PATH)/lib*.a)
+
+LIBS := $(patsubst lib%.a,-l%,$(notdir $(LIB_FILES)))
+
+COMPONENT_ADD_LDFLAGS +=  -L$(COMPONENT_PATH)/ $(LIBS)
+
+ALL_LIB_FILES += $(LIB_FILES)
--- a/audio_front_end/include/esp_afe_sr_iface.h
+++ b/audio_front_end/include/esp_afe_sr_iface.h
@ -0,0 +1,150 @@
+#pragma once
+#include "stdint.h"
+#include "esp_wn_iface.h"
+#include "esp_wn_models.h"
+
+//AFE: Audio Front-End 
+//SR:  Speech Recognition
+//afe_sr/AFE_SR: the audio front-end for speech recognition
+
+//Opaque AFE_SR data container
+typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
+
+//Set AFE_SR mode
+typedef enum {
+	SR_MODE_LOW_COST = 0,            //LOW_COST, low memory consumption and CPU loading
+    SR_MODE_MEDIUM = 1,              //MEDIUM
+	SR_MODE_HIGH_PERF = 2,           //HIGH_PERF
+} afe_sr_mode_t;
+
+/**
+ * @brief Function to initialize an AFE_SR instance with a specified mode
+ * 
+ * @param mode              The mode of AFE_SR 
+ * @param perferred_core    The preferred core to be pinned to.
+ *                          If the AFE_SR tasks cannot all run in real time on a single core, the other core is used as well.
+ * @returns Handle to the AFE_SR data
+ */
+typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_t)(afe_sr_mode_t mode, int perferred_core);
+
+/**
+ * @brief Get the number of samples per channel in each frame that needs to be passed to the function
+ *
+ * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param afe The AFE_SR object to query
+ * @return The amount of samples to feed the fetch function
+ */
+typedef int (*esp_afe_sr_iface_op_get_frame_chunksize_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the number of channels of the samples that need to be passed to the feed function
+ * 
+ * @param afe The AFE_SR object to query
+ * @return The number of channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the function
+ *
+ * @param afe The AFE_SR object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Feed samples of an audio stream to the AFE_SR
+ *
+ * @Warning  The input data should be arranged in the format of [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH1_1, ..., CHN_1, ...].
+ *           The last channel is reference signal or far-end signal.
+ *
+ * @param afe   The AFE_SR object to query
+ * @param in    The input microphone signal; only signed 16-bit samples at 16 kHz are supported. The frame size can be
+ *              queried with `get_frame_chunksize` and the channel number with `get_channel_num`.
+ * @return      The size of input
+ */
+typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
+
+/**
+ * @brief Fetch enhanced samples of an audio stream from the AFE_SR
+ *
+ * @Warning  The output is single channel data, no matter how many channels the input is.
+ *
+ * @param afe   The AFE_SR object to query
+ * @param out   The output enhanced signal. The frame size can be queried by the `get_frame_chunksize`.
+ * @return      The type of output, -1: noise, 0: speech, 1: wake word 1, 2: wake word 2, ...
+ */
+typedef int (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe, int16_t* out);
+
+/**
+ * @brief Initialize wakenet and the wake word coefficients, or reset them
+ *        when wakenet has already been initialized.
+ *
+ * @param afe          The AFE_SR object to query
+ * @param wakenet      The pointer of wakenet
+ * @param model_coeff  The coefficient of wake word model
+ * @return             0: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe,
+                                                  esp_wn_iface_t *wakenet, 
+                                                  const model_coeff_getter_t *model_coeff);
+
+/**
+ * @brief Disable wakenet model.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             0: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable wakenet model.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             0: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Disable AEC algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             0: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable AEC algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             0: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Destroy an AFE_SR instance
+ *
+ * @param afe         AFE_SR object to destroy
+ */
+typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
+
+
+/**
+ * This structure contains the functions used to do operations on an AFE_SR.
+ */
+typedef struct {
+    esp_afe_sr_iface_op_create_t create;
+    esp_afe_sr_iface_op_feed_t feed;
+    esp_afe_sr_iface_op_fetch_t fetch;
+    esp_afe_sr_iface_op_get_frame_chunksize_t get_frame_chunksize;
+    esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
+    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
+    esp_afe_sr_iface_op_set_wakenet_t  set_wakenet; 
+    esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
+    esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
+    esp_afe_sr_iface_op_disable_aec_t disable_aec;
+    esp_afe_sr_iface_op_enable_aec_t enable_aec;
+    esp_afe_sr_iface_op_destroy_t destroy;
+} esp_afe_sr_iface_t;
--- a/audio_front_end/include/esp_afe_sr_models.h
+++ b/audio_front_end/include/esp_afe_sr_models.h
@ -0,0 +1,6 @@
+#pragma once
+#include "esp_afe_sr_iface.h"
+
+extern const esp_afe_sr_iface_t esp_afe_sr_2mic;
+extern const esp_afe_sr_iface_t esp_afe_sr_1mic;
+
--- a/audio_front_end/libesp_audio_front_end.a
+++ b/audio_front_end/libesp_audio_front_end.a
--- a/lib/libc_speech_features.a
+++ b/lib/libc_speech_features.a
--- a/lib/libdl_lib_esp32.a
+++ b/lib/libdl_lib_esp32.a
--- a/lib/libdl_lib_esp32s3.a
+++ b/lib/libdl_lib_esp32s3.a
--- a/lib/libwakenet.a
+++ b/lib/libwakenet.a
--- a/wake_word_engine/include/esp_wn_iface.h
+++ b/wake_word_engine/include/esp_wn_iface.h
@ -10,7 +10,11 @@ typedef struct model_iface_data_t model_iface_data_t;
 //As a consequence also the false alarm rate goes up
 typedef enum {
 	DET_MODE_90 = 0,  //Normal, response accuracy rate about 90%
-	DET_MODE_95       //Aggressive, response accuracy rate about 95%
+	DET_MODE_95 = 1,       //Aggressive, response accuracy rate about 95%
+    DET_MODE_2CH_90 = 2,
+    DET_MODE_2CH_95 = 3,
+    DET_MODE_3CH_90 = 4,
+    DET_MODE_3CH_95 = 5,
 } det_mode_t;

 typedef struct {
@ -27,18 +31,6 @@ typedef struct {
 */
 typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode);

-/**
- * @brief Function type to initialze a model instance with a detection mode and specified wake word coefficient
- *
- * Warning: Just wakeNet6 support this function to select which core to run neural network. 
- * 
- * @param det_mode    The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
- * @param model_coeff The specified wake word model coefficient
- * @param core        Core to run neural network
- * @returns Handle to the model data
- */
-typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode, int core);
-
 /**
 * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
 *
@ -50,6 +42,17 @@ typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const mod
 */
 typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);

+/**
+ * @brief Callback function type to fetch the number of channels of the samples that need to be passed to the detect function
+ *
+ * This function can be used to query how many audio channels the model expects as input.
+ * Note that the returned value is a channel count, not a number of samples or bytes.
+ *
+ * @param model The model object to query
+ * @return The number of channels to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+

 /**
 * @brief Get the sample rate of the samples to feed to the detect function
@ -109,6 +112,23 @@ typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model,
 */
 typedef int (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);

+/**
+ * @brief Get the volume gain
+ *
+ * @param model The model object to query
+ * @param target_db  The target dB to calculate volume gain
+ * @returns the volume gain
+ */
+typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
 /**
 * @brief Destroy a speech recognition model
 *
@ -122,13 +142,15 @@ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
 */
 typedef struct {
    esp_wn_iface_op_create_t create;
-    esp_wn_iface_op_create_pinned_to_core_t create_pinned_to_core;
    esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_wn_iface_op_get_channel_num_t get_channel_num;
    esp_wn_iface_op_get_samp_rate_t get_samp_rate;
    esp_wn_iface_op_get_word_num_t get_word_num;
    esp_wn_iface_op_get_word_name_t get_word_name;
    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
+    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
    esp_wn_iface_op_detect_t detect;
    esp_wn_iface_op_destroy_t destroy;
 } esp_wn_iface_t;
--- a/wake_word_engine/include/esp_wn_models.h
+++ b/wake_word_engine/include/esp_wn_models.h
@ -4,25 +4,19 @@
 //Contains declarations of all available speech recognion models. Pair this up with the right coefficients and you have a model that can recognize
 //a specific phrase or word.

-extern const esp_wn_iface_t esp_sr_wakenet3_quantized;
-extern const esp_wn_iface_t esp_sr_wakenet4_quantized;
 extern const esp_wn_iface_t esp_sr_wakenet5_quantized;
-extern const esp_wn_iface_t esp_sr_wakenet5_float;
-extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
+extern const esp_wn_iface_t esp_sr_wakenet7_quantized;
+extern const esp_wn_iface_t esp_sr_wakenet7_quantized8;

 /*
 Configure network to use based on what's selected in menuconfig.
 */
-#if CONFIG_SR_MODEL_WN3_QUANT
-#define WAKENET_MODEL esp_sr_wakenet3_quantized
-#elif CONFIG_SR_MODEL_WN4_QUANT
-#define WAKENET_MODEL esp_sr_wakenet4_quantized
-#elif CONFIG_SR_MODEL_WN5_FLOAT
-#define WAKENET_MODEL esp_sr_wakenet5_float
-#elif CONFIG_SR_MODEL_WN5_QUANT
+#if CONFIG_SR_MODEL_WN5_QUANT
 #define WAKENET_MODEL esp_sr_wakenet5_quantized
-#elif CONFIG_SR_MODEL_WN6_QUANT
-#define WAKENET_MODEL esp_sr_wakenet6_quantized
+#elif CONFIG_SR_MODEL_WN7_QUANT
+#define WAKENET_MODEL esp_sr_wakenet7_quantized
+#elif CONFIG_SR_MODEL_WN7_QUANT8
+#define WAKENET_MODEL esp_sr_wakenet7_quantized8
 #else
 #error No valid neural network model selected.
 #endif
@ -30,19 +24,7 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
 /*
 Configure wake word to use based on what's selected in menuconfig.
 */
-#if CONFIG_SR_WN3_HILEXIN 
-#include "hilexin_wn3.h"
-#define WAKENET_COEFF get_coeff_hilexin_wn3
-
-#elif CONFIG_SR_WN4_HILEXIN
-#include "hilexin_wn4.h"
-#define WAKENET_COEFF get_coeff_hilexin_wn4
-
-#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_FLOAT
-#include "hilexin_wn5_float.h"
-#define WAKENET_COEFF get_coeff_hilexin_wn5_float
-
-#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
+#if CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
 #include "hilexin_wn5.h"
 #define WAKENET_COEFF get_coeff_hilexin_wn5

@ -74,17 +56,37 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
 #include "hijeson_wn5X3.h"
 #define WAKENET_COEFF get_coeff_hijeson_wn5X3

-#elif CONFIG_SR_WN6_NIHAOXIAOXIN
-#include "nihaoxiaoxin_wn6.h"
-#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6
-
 #elif CONFIG_SR_WN5_CUSTOMIZED_WORD
 #include "customized_word_wn5.h"
 #define WAKENET_COEFF get_coeff_customized_word_wn5

-#elif CONFIG_SR_WN6_CUSTOMIZED_WORD
-#include "customized_word_wn6.h"
-#define WAKENET_COEFF get_coeff_customized_word_wn6
+#elif CONFIG_SR_WN7_CUSTOMIZED_WORD
+#include "customized_word_wn7.h"
+#define WAKENET_COEFF get_coeff_customized_word_wn7
+
+#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT
+#include "xiaoaitongxue_wn7.h"
+#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7
+
+#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT8
+#include "xiaoaitongxue_wn7_q8.h"
+#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7_q8
+
+#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT
+#include "hilexin_wn7.h"
+#define WAKENET_COEFF get_coeff_hilexin_wn7
+
+#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT8
+#include "hilexin_wn7_q8.h"
+#define WAKENET_COEFF get_coeff_hilexin_wn7_q8
+
+#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT
+#include "alexa_wn7.h"
+#define WAKENET_COEFF get_coeff_alexa_wn7
+
+#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT8
+#include "alexa_wn7_q8.h"
+#define WAKENET_COEFF get_coeff_alexa_wn7_q8

 #else
 #error No valid wake word selected.
--- a/wake_word_engine/libwakeword_model.a
+++ b/wake_word_engine/libwakeword_model.a