feat(system): Support esp32s3

2025-09-15 15:28:44 +08:00 · 2021-04-02 14:09:33 +08:00 · 2021-04-02 14:09:33 +08:00 · a78b3d22b1
commit a78b3d22b1
parent 2a7aab136b
55 changed files with 305 additions and 213 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -27,30 +27,23 @@ add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libwakenet.a" PRIV
 ENDIF (IDF_VER MATCHES "v4.")
 add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libmultinet.a" PRIV_REQUIRES esp-sr) 

+add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/audio_front_end/libesp_audio_front_end.a" PRIV_REQUIRES esp-sr) 
+
 if(IDF_TARGET STREQUAL "esp32")
 target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
    wakenet
    dl_lib_esp32
    c_speech_features
-    hilexin_wn3
-    hilexin_wn4
    hilexin_wn5
-    hilexin_wn5X2
    hilexin_wn5X3
-    hijeson_wn5X3
-    nihaoxiaozhi_wn5
-    nihaoxiaozhi_wn5X2
-    nihaoxiaozhi_wn5X3
-    nihaoxiaoxin_wn6
-    nihaoxiaoxin_wn5X3
    customized_word_wn5
-    customized_word_wn6
    multinet
-    multinet1_ch
    multinet1_en
+    multinet2_ch
    esp_tts_chinese 
    voice_set_xiaole 
    voice_set_template
+    esp_audio_front_end
    esp_audio_processor "-Wl,--end-group")
 endif()

@ -63,16 +56,19 @@ target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
 endif()


-if(IDF_TARGET STREQUAL "esp32s3beta")
+if(IDF_TARGET STREQUAL "esp32s3")
 target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
    wakenet
-    esp-dsp
+    hilexin_wn7
+    xiaoaitongxue_wn7
+    xiaoaitongxue_wn7_q8
+    customized_word_wn7
    dl_lib_esp32s3
    c_speech_features
-    wakeword_model
-    multinet_model
    esp_audio_front_end
    esp_audio_processor
    multinet
+    multinet3_en
+    multinet3_ch
    "-Wl,--end-group")
 endif()
--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@ -1,17 +1,26 @@
 menu "ESP Speech Recognition"

-choice NET_TO_USE_PRELOAD
+choice NET_TO_USE_ACCELERATION
    prompt "Net to use acceleration"
    default WAKENET_USE_PRELOAD
-    depends on IDF_TARGET_ESP32S3BETA
+    depends on IDF_TARGET_ESP32S3
    help
        Select the Wake Word Engine to be used.

 config WAKENET_USE_PRELOAD
-    bool "wakenet"
+    bool "wakenet_preload"

 config MULTINET_USE_PRELOAD
-    bool "multinet"
+    bool "multinet_preload"
+
+config WAKENET_USE_EDMA
+    bool "wakenet_edma"
+
+config MULTINET_USE_EDMA
+    bool "multinet_edma"
+
+config ALL_DISABLE
+    bool "all disable"

 endchoice

@ -23,12 +32,15 @@ choice SR_MODEL_SEL

 config SR_MODEL_WN5_QUANT
 	bool "WakeNet 5 (quantized with 16-bit)"
+    depends on IDF_TARGET_ESP32

 config SR_MODEL_WN7_QUANT
 	bool "WakeNet 7 (quantized with 16-bit)"
+    depends on IDF_TARGET_ESP32S3

 config SR_MODEL_WN7_QUANT8
 	bool "WakeNet 7 (quantized with 8-bit)"
+    depends on IDF_TARGET_ESP32S3

 endchoice

@ -41,98 +53,28 @@ choice SR_WAKE_WORD_SEL

 config SR_WN5_HILEXIN
 	bool "hilexin (WakeNet5)"
-	depends on SR_MODEL_WN5_QUANT  
-
-config SR_WN5X2_HILEXIN
-    bool "hilexin (WakeNet5X2)"
-    depends on SR_MODEL_WN5_QUANT  
+	depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT 

 config SR_WN5X3_HILEXIN
    bool "hilexin (WakeNet5X3)"
-    depends on SR_MODEL_WN5_QUANT  
+    depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT 

-config SR_WN5_NIHAOXIAOZHI
-	bool "nihaoxiaozhi (WakeNet5)"
-	depends on SR_MODEL_WN5_QUANT  
-
-config SR_WN5X2_NIHAOXIAOZHI
-	bool "nihaoxiaozhi (WakeNet5X2)"
-	depends on SR_MODEL_WN5_QUANT  
-
-config SR_WN5X3_NIHAOXIAOZHI
-	bool "nihaoxiaozhi (WakeNet5X3)"
-	depends on SR_MODEL_WN5_QUANT  
-
-config SR_WN5X3_HIJESON
-	bool "hi jeson (WakeNet5X3)"
-	depends on SR_MODEL_WN5_QUANT  
-
-config SR_WN5X3_NIHAOXIAOXIN
-    bool "nihaoxiaoxin (WakeNet5X3)"
-    depends on SR_MODEL_WN5_QUANT  
+config SR_WN7_HILEXIN
+    bool "hilexin (WakeNet7)"
+    depends on SR_MODEL_WN7_QUANT

 config SR_WN5_CUSTOMIZED_WORD
 	bool "customized word (WakeNet5)"
-	depends on SR_MODEL_WN5_QUANT  
+	depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT 

 config SR_WN7_CUSTOMIZED_WORD
 	bool "customized word (WakeNet7)"
-	depends on SR_MODEL_WN7_QUANT || SR_MODEL_WN7_QUANT8
-
-config SR_WN7_HILEXIN
-	bool "hilexin (WakeNet7)"
-	depends on SR_MODEL_WN7_QUANT || SR_MODEL_WN7_QUANT8
+	depends on SR_MODEL_WN7_QUANT 

 config SR_WN7_XIAOAITONGXUE
 	bool "xiaoaitongxue (WakeNet7)"
 	depends on SR_MODEL_WN7_QUANT || SR_MODEL_WN7_QUANT8

-config SR_WN7_ALEXA
-	bool "alexa (WakeNet7)"
-	depends on SR_MODEL_WN7_QUANT || SR_MODEL_WN7_QUANT8
-
-endchoice
-
-choice SR_MN_MODEL_SEL
-    prompt "speech commands recognition model after wake up"
-    default MULTINET1
-    help
-        Select the model to be used.
-
-config MULTINET1
-    depends on SR_MODEL_WN5_QUANT  || SR_MODEL_WN6_QUANT
-    bool "MultiNet 1"
-
-config MULTINET2
-    depends on SR_MODEL_WN7_QUANT || SR_MODEL_WN7_QUANT8
-    bool "MultiNet 2"
-
-config MULTINET3
-    bool "MultiNet 3"
-
-config MULTINET4
-    bool "MultiNet 4"
-
-config MULTINET5
-    bool "MultiNet 5"
-
-config MULTINET6
-    bool "MultiNet 6"
-
-endchoice
-
-choice SR_MN_MODE_SEL
-    prompt "speech commands recognition mode after wake up"
-    default SINGLE_RECOGNITION
-    help
-        Select the mode to be used.
-
-config SINGLE_RECOGNITION
-    bool "Single recognition"
-
-config CONTINUOUS_RECOGNITION
-    bool "Continuous recognition"
-
 endchoice

 choice SR_LANGUAGE_SEL
@ -143,11 +85,37 @@ choice SR_LANGUAGE_SEL

 config SR_CHINESE
    bool "chinese"
-    depends on SINGLE_RECOGNITION || CONTINUOUS_RECOGNITION

 config SR_ENGLISH
    bool "english"
-    depends on SINGLE_RECOGNITION
+
+endchoice
+
+# Speech command model selection. Each entry is gated on the chip target whose
+# link group in CMakeLists.txt actually provides the matching model library.
+choice SR_MN_MODE_SEL
+    prompt "speech commands recognition model"
+    default CN_MULTINET3_SINGLE_RECOGNITION
+    help
+        Select the model to be used.
+
+# multinet1_en is the English model linked for the esp32 target.
+config EN_MULTINET1_SINGLE_RECOGNITION
+    bool "english single recognition (MultiNet1)"
+    depends on SR_ENGLISH && IDF_TARGET_ESP32
+
+# multinet3_en is linked only for the esp32s3 target.
+config EN_MULTINET3_SINGLE_RECOGNITION
+    bool "english single recognition (MultiNet3)"
+    depends on SR_ENGLISH && IDF_TARGET_ESP32S3
+
+config CN_MULTINET2_SINGLE_RECOGNITION
+    bool "chinese single recognition (MultiNet2)"
+    depends on SR_CHINESE && IDF_TARGET_ESP32
+
+config CN_MULTINET2_CONTINUOUS_RECOGNITION
+    bool "chinese continuous recognition (MultiNet2)"
+    depends on SR_CHINESE && IDF_TARGET_ESP32
+
+config CN_MULTINET3_SINGLE_RECOGNITION
+    bool "chinese single recognition (MultiNet3)"
+    depends on SR_CHINESE && IDF_TARGET_ESP32S3
+
+config CN_MULTINET3_CONTINUOUS_RECOGNITION
+    bool "chinese continuous recognition (MultiNet3)"
+    depends on SR_CHINESE && IDF_TARGET_ESP32S3

 endchoice

@ -237,7 +205,7 @@ config CN_SPEECH_COMMAND_ID14
 config CN_SPEECH_COMMAND_ID15
    string "ID15"
    depends on SR_CHINESE
-    default "bo fang ge qu"
+    default "kai shi bo fang"

 config CN_SPEECH_COMMAND_ID16
    string "ID16"
--- a/acoustic_algorithm/libesp_audio_processor.a
+++ b/acoustic_algorithm/libesp_audio_processor.a
--- a/audio_front_end/include/esp_afe_sr_iface.h
+++ b/audio_front_end/include/esp_afe_sr_iface.h
@ -12,6 +12,7 @@ typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;

 //Set AFE_SR mode
 typedef enum {
+    SR_MODE_MONO = -1,               //For mono, low memory consumption and CPU loading
 	SR_MODE_LOW_COST = 0,            //LOW_COST, low memory consumption and CPU loading
    SR_MODE_MEDIUM = 1,              //MEDIUM
 	SR_MODE_HIGH_PERF = 2,           //HIGH_PERF
@ -57,10 +58,11 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
 /**
 * @brief Feed samples of an audio stream to the AFE_SR
 *
- * @Warning  The input data should be arranged in the format of [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH0_1, ..., CHN_1, ...].
+ * @Warning  The input data should be arranged in the format of [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH1_1, ..., CHN_1, ...].
 *           The last channel is reference signal or far-end signal.
 *
- * @param afe   The AFE_SR object to query
+ * @param afe   The AFE_SR object to query
+ * 
 * @param in    The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the 
 *              `get_samp_chunksize`. The channel number can be queried `get_channel_num`.
 * @return      The size of input
@ -138,7 +140,9 @@ typedef struct {
    esp_afe_sr_iface_op_create_t create;
    esp_afe_sr_iface_op_feed_t feed;
    esp_afe_sr_iface_op_fetch_t fetch;
-    esp_afe_sr_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    // esp_afe_sr_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
    esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
    esp_afe_sr_iface_op_set_wakenet_t  set_wakenet; 
--- a/audio_front_end/libesp_audio_front_end.a
+++ b/audio_front_end/libesp_audio_front_end.a
--- a/lib/include/dl_lib.h
+++ b/lib/include/dl_lib.h
@ -16,9 +16,16 @@

 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
-
+#include "dl_lib_matrixq8.h"

 typedef int padding_state;
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+void *dl_lib_calloc(int cnt, int size, int align);
+void *dl_lib_calloc_psram(int cnt, int size, int align);
+
+void dl_lib_free(void *d);
+#endif
 /**
 * @brief Does a fast version of the exp() operation on a floating point number.
 *
@ -237,7 +244,7 @@ void dl_relu_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
 * @return Sigmoid output
 */
 int dl_sigmoid_op_q(const int in);
-
+int16_t dl_sigmoid_op_q8(const int16_t in);
 /**
 * @brief Does a sigmoid operation on a matrix, quantized version
 *
@ -263,6 +270,21 @@ void dl_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
 * @return tanh output
 */
 int dl_tanh_op_q(int v);
+int16_t dl_tanh_op_q8(int16_t v);
+
+
+qtp_t dl_hard_sigmoid_op(qtp_t in, int exponent);
+qtp_t dl_hard_tanh_op(qtp_t in, int exponent);
+
+int16_t dl_table_tanh_op(int16_t in, int exponent);
+int16_t dl_table_sigmoid_op(int16_t in, int exponent);
+
+void dl_hard_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_hard_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+void dl_table_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_table_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+

 /**
 * @brief Filter out the number greater than clip in the matrix, quantized version
--- a/lib/include/dl_lib_coefgetter_if.h
+++ b/lib/include/dl_lib_coefgetter_if.h
@ -16,6 +16,7 @@

 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

 //Set this if the coefficient requested is a batch-normalization popvar matrix which needs to be preprocessed by
 //dl_batch_normalize_get_sqrtvar first.
@ -58,8 +59,10 @@ memory for the returned matrices, when applicable.
 typedef struct {
    const dl_matrix2d_t* (*getter_f)(const char *name, void *arg, int hint);
    const dl_matrix2dq_t* (*getter_q)(const char *name, void *arg, int hint);
+    const dl_matrix2dq8_t* (*getter_q8)(const char *name, void *arg, int hint);
    void (*free_f)(const dl_matrix2d_t *m);
    void (*free_q)(const dl_matrix2dq_t *m);
+    void (*free_q8)(const dl_matrix2dq8_t *m);
    const model_info_t* (*getter_info)(void *arg);
    const alphabet_t* (*getter_alphabet)(void *arg);
 } model_coeff_getter_t;
--- a/lib/include/dl_lib_convq_queue.h
+++ b/lib/include/dl_lib_convq_queue.h
@ -14,9 +14,9 @@
 #ifndef DL_LIB_CONVQ_QUEUE_H
 #define DL_LIB_CONVQ_QUEUE_H

-
 #include "dl_lib_matrixq.h"
 #include "dl_lib_conv_queue.h"
+#include "dl_lib.h"

 //fixed-point convolution FIFO queue. 
 typedef struct {
@ -54,8 +54,8 @@ void dl_convq_queue_free(dl_convq_queue_t *cq);
 * @param cq    Input fixed-point convolution queue
 * @return      Pointer of oldest element  
 */
-qtp_t *dl_convq_queue_pop(dl_convq_queue_t *cq);
-
+inline qtp_t *dl_convq_queue_pop(dl_convq_queue_t *cq);
+inline qtp_t *dl_convq_queue_popn(dl_convq_queue_t *cq, int n);
 /**
 * @brief  Remove the oldest element, then insert the input element at the end of queue
 *
@ -75,14 +75,18 @@ void dl_convq_queue_push(dl_convq_queue_t *cq, dl_matrix2dq_t *a, int shift);
 */
 void dl_convq_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);

+void dl_convq16_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+dl_conv_queue_t *dl_queue_from_convq(dl_convq_queue_t *cq1);
+
 /**
 * @brief   Get the pointer of element in the queue by offset
 *
- * @param cq      Input fixed-point convolution queue
- * @param offset  Offset from the front of the queue
- * @return        Pointer of the element
+ * @param cq        Input fixed-point convolution queue
+ * @param last_num  Offset from the front of the queue
+ * @return          Pointer of the element
 */
-qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int offset);
+inline qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int last_num);

 /**
 * @brief   Does a tanh operation on the one of element in the convolution queue.
@ -93,7 +97,19 @@ qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int offset);
 * @param offset  Offset from the front of the queue
 * @return        Pointer of the element
 */
-void dl_tanh_convq(dl_convq_queue_t *cq, int last_num);
+void dl_tanh_convq(dl_convq_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on the one of element in multi channel convolution queue.
+ *          Gets the pointer of element in the convolution queue by offset, and does a 
+ *          tanh operation by this pointer, then return the pointer 
+ *
+ * @param cqm     Input fixed-point multi-channel convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param nch     The channel number of cqm
+ * @return        Pointer of the element
+ */
+void dl_tanh_convq_mc(dl_convq_queue_t **cqm, int offset, int nch);

 /**
 * @brief   Does a relu operation on the one of element in the convolution queue.
@ -134,10 +150,9 @@ fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
 * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
 * @return         The result of atrous convolution
 */
-qtp_t *dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
-                             dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift);
-qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
-                             dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset);
+qtp_t * dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int prenum);
+
 /**
 * @brief Fast implement of dilation layer as follows
 *
@ -156,19 +171,154 @@ qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int
 * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
 * @param gate_kernel     The kernel matrix of gate
 * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
- * @filter_shift          Shift ratio used in filter operation between two 16-bit fixed point vector
- * @gate_shift            Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param filter_shift          Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift            Shift ratio used in gate operation between two 16-bit fixed point vector
 * @return                The result of dilation layer
 */
+qtp_t *dl_dilation_layerq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+   dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+   dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+   int filter_shift, int gate_shift, int offset, int prenum);
+
+
 qtp_t *dl_dilation_layerq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
                          dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
-                          dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, 
-                          int filter_shift, int gate_shift);
+                          dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+                          int filter_shift, int gate_shift, int prenum);

-dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
-   const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
-void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
+qtp_t *dl_dilation_layerq16(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                             dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                             dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);

+
+qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset, int prenum);
+
+/**
+ * @brief Add a pair of fixed-point convolution queue item-by-item, and return float-point convolution queue
+ *
+ * @param cq1      First fixed-point convolution queue
+ * @param cq2      Second fixed-point convolution queue
+ * @return         The result of float-point convolution queue
+ */
 dl_conv_queue_t *dl_convq_queue_add(dl_convq_queue_t *cq1, dl_convq_queue_t *cq2);

+/**
+ * @brief Fast implement of LSTM layer by dl_atrous_conv1dq function
+ *
+ * @Warning LSTM kernel is split into two part, the first part input is the last layer output, 
+ *           and kernel is parameter *in_weight*. The second part input is the last frame LSTM output,
+ *           the kernel is parameters *h_weight*.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param state_c         Internal state of the LSTM network
+ * @param state_h         Internal state (previous output values) of the LSTM network
+ * @param in_weight       the LSTM kernel needed by first part
+ * @param h_weight        the LSTM kernel needed by second part
+ * @param bias            The bias matrix of LSTM. Can be NULL if a bias of 0 is required.
+ * @param in_shift        Shift ratio used in first part
+ * @param h_shift         Shift ratio used in second part
+ * @return                The result of LSTM layer
+ */
+dl_matrix2dq_t *dl_convq_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                    dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight,
+                                    const dl_matrix2dq_t *bias, int in_shift, int h_shift, int prenum);
+dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                                       const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
+
+dl_matrix2dq_t *dl_convq16_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                       dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight,
+                                       const dl_matrix2dq_t *bias, int prenum);
+
+/**
+ * @brief Allocate a fixed-point multi channel convolution queue 
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @param nch   The channel number of the convolution queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t **dl_convq_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point multi channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number of cqm
+ */
+void dl_convq_queue_mc_free(dl_convq_queue_t **cqm, int nch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param nch      The channel number of input 
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
+ * @param offset   the offset to calculate input convq
+ * @param prenum   the preload size, 0: do not use preload function
+ * @return         The result of atrous convolution
+ */
+qtp_t *dl_atrous_conv1dq_mc_steps(  dl_convq_queue_t **in,
+                                    dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* kernel,
+									dl_matrix2dq_t* bias,
+									int shift,
+									int offset,
+									int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows for multi channel input
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param nch             The channel number of input 
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          The offset to calculate input convq
+ * @param prenum          The preload size, 0: do not use preload function
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_mc_steps( dl_convq_queue_t **in, 
+									dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* filter_kernel,
+									dl_matrix2dq_t* filter_bias,
+                                    dl_matrix2dq_t* gate_kernel,
+									dl_matrix2dq_t* gate_bias,
+                                    int filter_shift,
+									int gate_shift,
+									int offset,
+									int prenum);
+
+void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
+void test_lstm_convq(int size, int in_dim, int lstm_cell);
+
 #endif
--- a/lib/include/dl_lib_matrix.h
+++ b/lib/include/dl_lib_matrix.h
@ -170,6 +170,7 @@ void dl_matrix_add_const(dl_matrix2d_t *subj, const fptp_t add);
 */
 dl_matrix2d_t *dl_matrix_concat(const dl_matrix2d_t *a, const dl_matrix2d_t *b);

+dl_matrix2d_t *dl_matrix_concat_h( dl_matrix2d_t *a, const dl_matrix2d_t *b);

 /**
 * @brief Print the contents of a matrix to stdout. Used for debugging.
@ -230,7 +231,7 @@ inline static void dl_matrix_set(dl_matrix2d_t *m, const int x, const int y, fpt
    DL_ITM(m, x, y)=val;
 }

-
+void matrix_get_range(const dl_matrix2d_t *m, fptp_t *rmin, fptp_t *rmax);

 #endif

--- a/lib/include/dl_lib_matrixq.h
+++ b/lib/include/dl_lib_matrixq.h
@ -98,7 +98,7 @@ typedef struct {
 * @return The matrix, or NULL if out of memory
 */
 dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
-
+dl_matrix2dq_t *dl_matrixq_alloc_psram(int w, int h);
 /**
 * @brief Convert a floating-point matrix to a quantized matrix
 *
@ -108,7 +108,6 @@ dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
 */
 dl_matrix2dq_t *dl_matrixq_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq_t *out);

-
 /**
 * TODO: DESCRIBE THIS FUNCTION
 */
@ -270,7 +269,7 @@ void dl_matrixq_sub(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2
 * @param b     Second multiplicand
 * @param res   Multiplicated data. Can be equal to a or b to overwrite that matrix.
 */
-void dl_matrixq_mul(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res);
+void dl_matrixq_mul( dl_matrix2dq_t *a,  dl_matrix2dq_t *b, dl_matrix2dq_t *res);

 /**
 * @brief Divide a pair of quantized matrices item-by-item: res=a/b
--- a/lib/libc_speech_features.a
+++ b/lib/libc_speech_features.a
--- a/lib/libdl_lib_esp32.a
+++ b/lib/libdl_lib_esp32.a
--- a/lib/libdl_lib_esp32s3.a
+++ b/lib/libdl_lib_esp32s3.a
--- a/lib/libesp-dsp.a
+++ b/lib/libesp-dsp.a
--- a/lib/libmultinet.a
+++ b/lib/libmultinet.a
--- a/lib/libwakenet.a
+++ b/lib/libwakenet.a
--- a/speech_command_recognition/include/esp_mn_models.h
+++ b/speech_command_recognition/include/esp_mn_models.h
@ -3,30 +3,38 @@

 //Contains declarations of all available speech recognion models. Pair this up with the right coefficients and you have a model that can recognize
 //a specific phrase or word.
-extern const esp_mn_iface_t esp_sr_multinet1_single_quantized_cn;
-extern const esp_mn_iface_t esp_sr_multinet1_continuous_quantized_cn;
 extern const esp_mn_iface_t esp_sr_multinet1_single_quantized_en;
+extern const esp_mn_iface_t esp_sr_multinet3_single_quantized_en;
 extern const esp_mn_iface_t esp_sr_multinet2_single_quantized_cn;
+extern const esp_mn_iface_t esp_sr_multinet3_single_quantized_cn;
+extern const esp_mn_iface_t esp_sr_multinet3_continuous_quantized_cn;

 /*
 Configure wake word to use based on what's selected in menuconfig.
 */
-#if CONFIG_MULTINET1 && CONFIG_SR_CHINESE && CONFIG_SINGLE_RECOGNITION
-#include "multinet1_ch.h"
-#define MULTINET_MODEL esp_sr_multinet1_single_quantized_cn
-#define MULTINET_COEFF get_coeff_multinet1_ch
-#elif CONFIG_MULTINET1 && CONFIG_SR_CHINESE && CONFIG_CONTINUOUS_RECOGNITION
-#include "multinet1_ch.h"
-#define MULTINET_MODEL esp_sr_multinet1_continuous_quantized_cn
-#define MULTINET_COEFF get_coeff_multinet1_ch
-#elif CONFIG_MULTINET1 && CONFIG_SR_ENGLISH && CONFIG_SINGLE_RECOGNITION
+
+#ifdef CONFIG_EN_MULTINET1_SINGLE_RECOGNITION
 #include "multinet1_en.h"
 #define MULTINET_MODEL esp_sr_multinet1_single_quantized_en
 #define MULTINET_COEFF get_coeff_multinet1_en
-#elif CONFIG_MULTINET2 && CONFIG_SR_CHINESE && CONFIG_SINGLE_RECOGNITION
+#elif CONFIG_EN_MULTINET3_SINGLE_RECOGNITION
+#include "multinet3_en.h"
+#define MULTINET_MODEL esp_sr_multinet3_single_quantized_en
+#define MULTINET_COEFF get_coeff_multinet3_en
+#elif CONFIG_CN_MULTINET2_SINGLE_RECOGNITION
 #include "multinet2_ch.h"
 #define MULTINET_MODEL esp_sr_multinet2_single_quantized_cn
 #define MULTINET_COEFF get_coeff_multinet2_ch
+#elif CONFIG_CN_MULTINET2_CONTINUOUS_RECOGNITION
+//No MultiNet2 continuous interface is declared in this header, so stop the build with the real cause.
+#error Chinese continuous recognition (MultiNet2) is not supported, select another speech commands recognition model.
+#elif CONFIG_CN_MULTINET3_SINGLE_RECOGNITION
+#include "multinet3_ch.h"
+#define MULTINET_MODEL esp_sr_multinet3_single_quantized_cn
+#define MULTINET_COEFF get_coeff_multinet3_ch
+#elif CONFIG_CN_MULTINET3_CONTINUOUS_RECOGNITION
+#include "multinet3_ch.h"
+#define MULTINET_MODEL esp_sr_multinet3_continuous_quantized_cn
+#define MULTINET_COEFF get_coeff_multinet3_ch
 #else
 #error No valid wake word selected.
 #endif
--- a/speech_command_recognition/include/multinet1_en.h
+++ b/speech_command_recognition/include/multinet1_en.h
@ -1,8 +1,9 @@
-//Generated by mkmodel
+//Generated by mkmodel_py
 #pragma once
 #include <string.h>
 #include "dl_lib_coefgetter_if.h"
 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

-extern const model_coeff_getter_t get_coeff_multinet1_en;
+extern const model_coeff_getter_t get_coeff_multinet1_en;
--- a/speech_command_recognition/include/multinet3_ch.h
+++ b/speech_command_recognition/include/multinet3_ch.h
@ -1,8 +1,9 @@
-//Generated by mkmodel
+//Generated by mkmodel_py
 #pragma once
 #include <string.h>
 #include "dl_lib_coefgetter_if.h"
 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

-extern const model_coeff_getter_t get_coeff_customized_word_wn6;
+extern const model_coeff_getter_t get_coeff_multinet3_ch;
--- a/speech_command_recognition/include/multinet3_en.h
+++ b/speech_command_recognition/include/multinet3_en.h
@ -1,8 +1,9 @@
-//Generated by mkmodel
+//Generated by mkmodel_py
 #pragma once
 #include <string.h>
 #include "dl_lib_coefgetter_if.h"
 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

-extern const model_coeff_getter_t get_coeff_multinet1_ch;
+extern const model_coeff_getter_t get_coeff_multinet3_en;
--- a/speech_command_recognition/libmultinet1_ch.a
+++ b/speech_command_recognition/libmultinet1_ch.a
--- a/speech_command_recognition/libmultinet1_en.a
+++ b/speech_command_recognition/libmultinet1_en.a
--- a/speech_command_recognition/libmultinet2_ch.a
+++ b/speech_command_recognition/libmultinet2_ch.a
--- a/speech_command_recognition/libmultinet3_ch.a
+++ b/speech_command_recognition/libmultinet3_ch.a
--- a/speech_command_recognition/libmultinet3_en.a
+++ b/speech_command_recognition/libmultinet3_en.a
--- a/speech_command_recognition/libmultinet_model.a
+++ b/speech_command_recognition/libmultinet_model.a
--- a/wake_word_engine/include/hilexin_wn4.h
+++ b/wake_word_engine/include/hilexin_wn4.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_hilexin_wn4;
--- a/wake_word_engine/include/hilexin_wn5X2.h
+++ b/wake_word_engine/include/hilexin_wn5X2.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_hilexin_wn5X2;
--- a/wake_word_engine/include/hilexin_wn6.h
+++ b/wake_word_engine/include/hilexin_wn6.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_hilexin_wn6;
--- a/wake_word_engine/include/nihaoxiaoxin_wn5X3.h
+++ b/wake_word_engine/include/nihaoxiaoxin_wn5X3.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_nihaoxiaoxin_wn5X3;
--- a/wake_word_engine/include/nihaoxiaoxin_wn6.h
+++ b/wake_word_engine/include/nihaoxiaoxin_wn6.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_nihaoxiaoxin_wn6;
--- a/wake_word_engine/include/nihaoxiaozhi_wn5.h
+++ b/wake_word_engine/include/nihaoxiaozhi_wn5.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_nihaoxiaozhi_wn5;
--- a/wake_word_engine/include/nihaoxiaozhi_wn5X2.h
+++ b/wake_word_engine/include/nihaoxiaozhi_wn5X2.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_nihaoxiaozhi_wn5X2;
--- a/wake_word_engine/include/nihaoxiaozhi_wn5X3.h
+++ b/wake_word_engine/include/nihaoxiaozhi_wn5X3.h
@ -1,8 +0,0 @@
-//Generated by mkmodel
-#pragma once
-#include <string.h>
-#include "dl_lib_coefgetter_if.h"
-#include "dl_lib_matrix.h"
-#include "dl_lib_matrixq.h"
-
-extern const model_coeff_getter_t get_coeff_nihaoxiaozhi_wn5X3;
--- a/wake_word_engine/include/xiaoaitongxue_wn7.h
+++ b/wake_word_engine/include/xiaoaitongxue_wn7.h
@ -1,8 +1,9 @@
-//Generated by mkmodel
+//Generated by mkmodel_py
 #pragma once
 #include <string.h>
 #include "dl_lib_coefgetter_if.h"
 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

-extern const model_coeff_getter_t get_coeff_hijeson_wn5X3;
+extern const model_coeff_getter_t get_coeff_xiaoaitongxue_wn7;
--- a/wake_word_engine/include/xiaoaitongxue_wn7_q8.h
+++ b/wake_word_engine/include/xiaoaitongxue_wn7_q8.h
@ -1,8 +1,9 @@
-//Generated by mkmodel
+//Generated by mkmodel_py
 #pragma once
 #include <string.h>
 #include "dl_lib_coefgetter_if.h"
 #include "dl_lib_matrix.h"
 #include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"

-extern const model_coeff_getter_t get_coeff_hilexin_wn3;
+extern const model_coeff_getter_t get_coeff_xiaoaitongxue_wn7_q8;
--- a/wake_word_engine/libcustomized_word_wn5.a
+++ b/wake_word_engine/libcustomized_word_wn5.a
--- a/wake_word_engine/libcustomized_word_wn6.a
+++ b/wake_word_engine/libcustomized_word_wn6.a
--- a/wake_word_engine/libcustomized_word_wn7.a
+++ b/wake_word_engine/libcustomized_word_wn7.a
--- a/wake_word_engine/libhijeson_wn5X3.a
+++ b/wake_word_engine/libhijeson_wn5X3.a
--- a/wake_word_engine/libhilexin_wn3.a
+++ b/wake_word_engine/libhilexin_wn3.a
--- a/wake_word_engine/libhilexin_wn4.a
+++ b/wake_word_engine/libhilexin_wn4.a
--- a/wake_word_engine/libhilexin_wn5.a
+++ b/wake_word_engine/libhilexin_wn5.a
--- a/wake_word_engine/libhilexin_wn5X2.a
+++ b/wake_word_engine/libhilexin_wn5X2.a
--- a/wake_word_engine/libhilexin_wn5X3.a
+++ b/wake_word_engine/libhilexin_wn5X3.a
--- a/wake_word_engine/libhilexin_wn6.a
+++ b/wake_word_engine/libhilexin_wn6.a
--- a/wake_word_engine/libhilexin_wn7.a
+++ b/wake_word_engine/libhilexin_wn7.a
--- a/wake_word_engine/libnihaoxiaoxin_wn5X3.a
+++ b/wake_word_engine/libnihaoxiaoxin_wn5X3.a
--- a/wake_word_engine/libnihaoxiaoxin_wn6.a
+++ b/wake_word_engine/libnihaoxiaoxin_wn6.a
--- a/wake_word_engine/libnihaoxiaozhi_wn5.a
+++ b/wake_word_engine/libnihaoxiaozhi_wn5.a
--- a/wake_word_engine/libnihaoxiaozhi_wn5X2.a
+++ b/wake_word_engine/libnihaoxiaozhi_wn5X2.a
--- a/wake_word_engine/libnihaoxiaozhi_wn5X3.a
+++ b/wake_word_engine/libnihaoxiaozhi_wn5X3.a
--- a/wake_word_engine/libwakeword_model.a
+++ b/wake_word_engine/libwakeword_model.a
--- a/wake_word_engine/libxiaoaitongxue_wn7.a
+++ b/wake_word_engine/libxiaoaitongxue_wn7.a
--- a/wake_word_engine/libxiaoaitongxue_wn7_q8.a
+++ b/wake_word_engine/libxiaoaitongxue_wn7_q8.a