diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 28c058d..b65bb86 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -49,6 +49,9 @@ before_script:
 
 .patterns-test_esp_sr: &patterns-test_esp32c5
   - "lib/esp32c5/*"
+  - "lib/esp32c3/*"
+  - "lib/esp32c6/*"
+  - "lib/esp32s2/*"
   - "include/esp32c5/*"
   - "src/**/*"
   - "model/**/*"
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a1cef47..37d0e16 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,11 @@
 # Change log for esp-sr
 
+## unreleased
+- Add wakenet9s and AEC support for esp32c3
+- Add wakenet9s and AEC support for esp32c5
+- Add wakenet9s and AEC support for esp32c6
+- Add wakenet9s and AEC support for esp32s2
+
 ## 2.0.5
 - Fix fftr bug
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index d7a3bd8..cfba116 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -69,8 +69,7 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
         "-Wl,--end-group")
 
 
-elseif(${IDF_TARGET} STREQUAL "esp32c5")
-
+elseif((${IDF_TARGET} STREQUAL "esp32c5") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6") OR  (${IDF_TARGET} STREQUAL "esp32s2"))
     set(srcs
         "src/model_path.c"
     )
@@ -78,6 +77,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
     set(include_dirs
         "include/${IDF_TARGET}"
         "src/include"
+        "esp-tts/esp_tts_chinese/include"
     )
 
     set(requires
@@ -99,6 +99,8 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
     add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME})
     add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
     add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
+    add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
+    add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
 
     target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
     target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)
@@ -106,30 +108,8 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
     target_link_libraries(${COMPONENT_LIB} PRIVATE c_speech_features)
     target_link_libraries(${COMPONENT_LIB} PRIVATE hufzip)
     target_link_libraries(${COMPONENT_LIB} PRIVATE wakenet)
-
-elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
-#Only support TTS on esp32s2, esp32c3 and esp32c6
-
-set(requires
-    spiffs
-    )
-
-IF (IDF_VERSION_MAJOR GREATER 4)
-    list(APPEND requires esp_partition)
-ENDIF (IDF_VERSION_MAJOR GREATER 4)
-
-idf_component_register(SRCS .
-                INCLUDE_DIRS  esp-tts/esp_tts_chinese/include
-                REQUIRES ${requires}
-                PRIV_REQUIRES spi_flash)
-
-target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}")
-add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
-add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
-target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
-    esp_tts_chinese
-    voice_set_xiaole
-    "-Wl,--end-group")
+    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_tts_chinese)
+    target_link_libraries(${COMPONENT_LIB} PRIVATE voice_set_xiaole)
 
 endif()
 
diff --git a/Kconfig.projbuild b/Kconfig.projbuild
index 3de3e02..c10ad0d 100644
--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@@ -54,7 +54,7 @@ endchoice
 
 
 menu "Load Multiple Wake Words"
-    depends on IDF_TARGET_ESP32C5 ||  IDF_TARGET_ESP32C3
+    depends on IDF_TARGET_ESP32C5 ||  IDF_TARGET_ESP32C3 || IDF_TARGET_ESP32C6 || IDF_TARGET_ESP32S2
 
     config SR_WN_WN9S_HILEXIN
     bool "Hi,乐鑫 (wn9s_hilexin)"
diff --git a/esp-tts/esp_tts_chinese/esp32c5/libesp_tts_chinese.a b/esp-tts/esp_tts_chinese/esp32c5/libesp_tts_chinese.a
new file mode 100644
index 0000000..8677b51
Binary files /dev/null and b/esp-tts/esp_tts_chinese/esp32c5/libesp_tts_chinese.a differ
diff --git a/esp-tts/esp_tts_chinese/esp32c5/libvoice_set_xiaole.a b/esp-tts/esp_tts_chinese/esp32c5/libvoice_set_xiaole.a
new file mode 100644
index 0000000..4d274fe
Binary files /dev/null and b/esp-tts/esp_tts_chinese/esp32c5/libvoice_set_xiaole.a differ
diff --git a/include/esp32c3/c_speech_features_config.h b/include/esp32c3/c_speech_features_config.h
new file mode 100644
index 0000000..e21e020
--- /dev/null
+++ b/include/esp32c3/c_speech_features_config.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <float.h>
+#include <math.h>
+
+/* #undef ENABLE_DOUBLE */
+
+#ifdef ENABLE_DOUBLE
+# define csf_float double
+# define csf_ceil ceil
+# define csf_floor floor
+# define csf_sin sin
+# define csf_log log
+# define csf_log10 log10
+# define csf_pow pow
+# define csf_sqrt sqrt
+# define csf_abs fabs
+# define csf_float_min DBL_MIN
+#else
+# define csf_float float
+# define csf_ceil ceilf
+# define csf_floor floorf
+# define csf_sin sinf
+# define csf_log logf
+# define csf_log10 log10f
+# define csf_pow powf
+# define csf_sqrt sqrtf
+# define csf_abs fabsf
+# define csf_float_min FLT_MIN
+#endif
diff --git a/include/esp32c3/dl_lib.h b/include/esp32c3/dl_lib.h
new file mode 100644
index 0000000..47e7c86
--- /dev/null
+++ b/include/esp32c3/dl_lib.h
@@ -0,0 +1,418 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_H
+#define DL_LIB_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#include "esp_heap_caps.h"
+#include "sdkconfig.h"
+#define DL_SPIRAM_SUPPORT 1
+#endif
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+#include "esp32s3/rom/cache.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int padding_state;
+
+// /**
+//  * @brief Allocate a chunk of memory which has the given capabilities.
+//  *        Equivalent semantics to libc malloc(), for capability-aware memory.
+//  *        In IDF, malloc(p) is equivalent to heap_caps_malloc(p, MALLOC_CAP_8BIT).
+//  * 
+//  * @param size  In bytes, of the amount of memory to allocate
+//  * @param caps  Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned
+//  *              MALLOC_CAP_SPIRAM:   Memory must be in SPI RAM
+//  *              MALLOC_CAP_INTERNAL: Memory must be internal; specifically it should not disappear when flash/spiram cache is switched off
+//  *              MALLOC_CAP_DMA:      Memory must be able to accessed by DMA
+//  *              MALLOC_CAP_DEFAULT:  Memory can be returned in a non-capability-specific memory allocation
+//  * @return Pointer to currently allocated heap memory
+//  **/
+// void *heap_caps_malloc(size_t size, uint32_t caps);
+
+/**
+ * @brief Allocate aligned memory from internal memory or external memory.
+ *        if cnt*size > CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL, allocate memory from internal RAM
+ *        else, allocate memory from PSRAM
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently allocated heap memory
+ */
+void *dl_lib_calloc(int cnt, int size, int align);
+
+/**
+ * @brief Always allocate aligned memory from external memory.
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently aligned heap memory
+ */
+void *dl_lib_calloc_psram(int cnt, int size, int align);
+
+/**
+ * @brief Free aligned memory allocated by `dl_lib_calloc` or `dl_lib_calloc_psram` 
+ * 
+ * @param ptr    Pointer to free
+ */
+void dl_lib_free(void *ptr);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * As described in https://codingforspeed.com/using-faster-exponential-approximation/
+ * Should be good til an input of 5 or so with a steps factor of 8.
+ *
+ * @param x Floating point input
+ * @param steps Approximation steps. More is more precise. 8 or 10 should be good enough for most purposes.
+ * @return Exp()'ed output
+ */
+fptp_t fast_exp(double x, int steps);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * @param x Floating point input
+ * @return Exp()'ed output
+ */
+double fast_exp_pro(double x);
+
+/**
+ * @brief Does a softmax operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a softmax operation on a quantized matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a floating point number
+ *
+ * @param in Floating point input
+ * @return Sigmoid output
+ */
+
+fptp_t dl_sigmoid_op(fptp_t in);
+
+
+/**
+ * @brief Does a sigmoid operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+/**
+ * @brief Does a tanh operation on a floating point number
+ *
+ * @param v         Floating point input number
+ * @return Tanh value
+ */
+fptp_t dl_tanh_op(fptp_t v);
+
+/**
+ * @brief Does a tanh operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a floating point number
+ *
+ * @param in        Floating point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+fptp_t dl_relu_op(fptp_t in, fptp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Fully connected layer operation
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Biases for the neurons. Can be NULL if a bias of 0 is required.
+ * @param out       Output array. Outputs are placed here. Needs to be an initialized, weight->w by in->h in size, matrix.
+ */
+void dl_fully_connect_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, dl_matrix2d_t *out);
+
+/**
+ * @brief Pre-calculate the sqrtvari variable for the batch_normalize function.
+ * The sqrtvari matrix depends on the variance and epsilon values, which normally are constant. Hence,
+ * this matrix only needs to be calculated once. This function does that.
+ *
+ * @param 
+ * @return
+ */
+void dl_batch_normalize_get_sqrtvar(const dl_matrix2d_t *variance, fptp_t epsilon, dl_matrix2d_t *out);
+
+/**
+ * @brief Batch-normalize a matrix
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @return
+ */
+void dl_batch_normalize(dl_matrix2d_t *m, const dl_matrix2d_t *offset, const dl_matrix2d_t *scale, 
+                        const dl_matrix2d_t *mean, const dl_matrix2d_t *sqrtvari);
+
+/**
+ * @brief Do a basic LSTM layer pass.
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @return          Output values of the neurons
+ */
+dl_matrix2d_t *dl_basic_lstm_layer(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h, 
+                const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a basic LSTM layer pass, partial quantized version.
+ * This LSTM function accepts 16-bit fixed-point weights and 32-bit float-point bias. 
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in		Input vector
+ * @param state_c	Internal state of the LSTM network
+ * @param state_h	Internal state (previous output values) of the LSTM network
+ * @param weight	Weights for the neurons, need to be quantised
+ * @param bias		Bias for the neurons. Can be NULL if no bias is required
+ * @return			Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_quantised_weights(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+				const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a fully-connected layer pass, fully-quantized version.
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Bias values of the neurons. Can be NULL if no bias is needed.
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @param out       Output values of the neurons
+ */
+void dl_fully_connect_layer_q(const dl_matrix2dq_t *in, const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Do a basic LSTM layer pass, fully-quantized version
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return          Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int shift);
+
+/**
+ * @brief Batch-normalize a matrix, fully-quantized version
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return
+ */
+void dl_batch_normalize_q(dl_matrix2dq_t *m, const dl_matrix2dq_t *offset, const dl_matrix2dq_t *scale, 
+                        const dl_matrix2dq_t *mean, const dl_matrix2dq_t *sqrtvari, int shift);
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a fixed-point number
+ * This accepts and returns fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in        Fixed-point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+qtp_t dl_relu_q_op(qtp_t in, qtp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in Fixed-point input
+ * @return Sigmoid output
+ */
+int dl_sigmoid_op_q(const int in);
+int16_t dl_sigmoid_op_q8(const int16_t in);
+/**
+ * @brief Does a sigmoid operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in Fixed-point input
+ * @return tanh output
+ */
+int dl_tanh_op_q(int v);
+int16_t dl_tanh_op_q8(int16_t v);
+
+void load_mat_psram_mn4(void);
+void load_mat_psram_mn3(void);
+void free_mat_psram_mn4(void);
+void free_mat_psram_mn3(void);
+qtp_t dl_hard_sigmoid_op(qtp_t in, int exponent);
+qtp_t dl_hard_tanh_op(qtp_t in, int exponent);
+
+int16_t dl_table_tanh_op(int16_t in, int exponent);
+int16_t dl_table_sigmoid_op(int16_t in, int exponent);
+
+void dl_hard_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_hard_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+void dl_table_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_table_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+
+/**
+ * @brief Filter out the number greater than clip in the matrix, float version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Filter out the number greater than clip in the matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+/**
+ * @brief Do a basic CNN layer pass.
+ *
+ * @Warning This just supports the single channel input image, and the output is single row matrix.
+            That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image 
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height
+ * @param bias           Bias for the CNN layer.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1.
+ * @return               The result of CNN layer.
+ */
+dl_matrix2d_t *dl_basic_conv_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                   const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+
+/**
+ * @brief Do a basic CNN layer pass, quantised version.
+ *
+ * @Warning This just supports the single channel input image, and the output is single row matrix.
+            That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image 
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height,
+ * @param bias           Bias of the neurons.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1
+ * @return               The result of CNN layer
+ */
+dl_matrix2d_t *dl_basic_conv_layer_quantised_weight(const dl_matrix2d_t *in, const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                                     const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_coefgetter_if.h b/include/esp32c3/dl_lib_coefgetter_if.h
new file mode 100644
index 0000000..a21de8d
--- /dev/null
+++ b/include/esp32c3/dl_lib_coefgetter_if.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_COEFGETTER_IF_H
+#define DL_LIB_COEFGETTER_IF_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "cJSON.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Set this if the coefficient requested is a batch-normalization popvar matrix which needs to be preprocessed by
+//dl_batch_normalize_get_sqrtvar first.
+#define COEF_GETTER_HINT_BNVAR (1<<0)
+
+/*
+This struct describes the basic information of model data: 
+word_num: the number of wake words or speech commands
+word_list: the name list of wake words or speech commands
+thresh_list: the threshold list of wake words or speech commands
+info_str: the string used to reflect the version and information of model data
+          which consist of the architecture of network, the version of model data, wake words and their threshold
+*/
+typedef struct {
+    int word_num;
+    char **word_list;
+    int *win_list;
+    float *thresh_list;
+    char *info_str;
+} model_info_t;
+
+/*
+Alphabet struct describes the basic grapheme or phoneme.
+item_num: the number of basic items (graphemes or phonemes)
+items: the list of basic item
+*/
+typedef struct {
+    int item_num;
+    char **items;
+}alphabet_t;
+
+/*
+This struct describes a generic coefficient getter: a way to get the constant coefficients needed for a neural network.
+For the two getters, the name describes the name of the coefficient matrix, usually the same as the Numpy filename the
+coefficient was originally stored in. The arg argument can be used to optionally pass an additional user-defined argument
+to the getter (e.g. the directory to look for files in the case of the Numpy file loader getter). The hint argument
+is a bitwise OR of the COEF_GETTER_HINT_* flags or 0 when none is needed. Use the free_f/free_q functions to release the
+memory for the returned matrices, when applicable.
+*/
+typedef struct {
+    const dl_matrix2d_t* (*getter_f)(const char *name, void *arg, int hint);
+    const dl_matrix2dq_t* (*getter_q)(const char *name, void *arg, int hint);
+    const dl_matrix2dq8_t* (*getter_q8)(const char *name, void *arg, int hint);
+    void (*free_f)(const dl_matrix2d_t *m);
+    void (*free_q)(const dl_matrix2dq_t *m);
+    void (*free_q8)(const dl_matrix2dq8_t *m);
+    const model_info_t* (*getter_info)(void *arg);
+    const alphabet_t* (*getter_alphabet)(void *arg);
+    const cJSON* (*getter_config)(void *arg);
+} model_coeff_getter_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_conv_queue.h b/include/esp32c3/dl_lib_conv_queue.h
new file mode 100644
index 0000000..7cb9bf9
--- /dev/null
+++ b/include/esp32c3/dl_lib_conv_queue.h
@@ -0,0 +1,180 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONV_QUEUE_H
+#define DL_LIB_CONV_QUEUE_H
+
+
+#include "dl_lib_matrix.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+//Flags for matrices
+// #define DL_MF_FOREIGNDATA (0)  /*< Matrix *item data actually points to another matrix and should not be freed */
+
+//Float convolution FIFO queue. 
+typedef struct {
+    int n;          /*< the length of queue */
+    int c;          /*< the channel number of queue element*/
+    int front;      /*< the front(top) position of queue */
+    int flag;       /*< not used*/
+    fptp_t *item;   /*< Pointer to item array */
+} dl_conv_queue_t;
+
+/**
+ * @brief Allocate a convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a convolution queue from psram
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Free a convolution queue
+ *
+ * @param cq     The convolution queue to free
+ */
+void dl_conv_queue_free(dl_conv_queue_t *cq);
+
+void dl_conv_to_matrix2d(dl_conv_queue_t *cq, dl_matrix2d_t* out);
+
+/**
+ * @brief Move the front pointer of queue forward,
+          so the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input convolution queue
+ * @return      Pointer of oldest element  
+ */
+fptp_t *dl_conv_queue_pop(dl_conv_queue_t *cq);
+
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ *
+ * @param cq     Input convolution queue
+ * @param item   The new element
+ */
+void dl_conv_queue_push(dl_conv_queue_t *cq, fptp_t* item);
+
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_get_queue_item(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a sigmoid operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a sigmoid operation
+ * by this pointer, then return the pointer      
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_sigmoid_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a tanh operation
+ * by this pointer, then return the pointer  
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_tanh_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a softmax operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a softmax operation
+ * by this pointer, then return the pointer 
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_softmax_step(dl_conv_queue_t *cq, int offset);
+
+fptp_t *dl_relu_step(dl_conv_queue_t *cq, int offset);
+fptp_t *dl_relu_look(dl_matrix2d_t *cq, int offset);
+dl_matrix2d_t *dl_matrix_concat1(const dl_conv_queue_t *a, const dl_matrix2d_t *b);
+dl_matrix2d_t *dl_basic_lstm_layer1(const dl_conv_queue_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+                                   const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+/**
+ * @brief Fast implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is first element of output queue and should not be freed separately.
+ *
+ * @param in       Input convolution queue
+ * @param out      Output convolution queue
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @return         The result of atrous convolution
+ */
+fptp_t *dl_atrous_conv1d_step(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                              dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+fptp_t *dl_look_conv_step(dl_conv_queue_t *in, dl_matrix2d_t *out, int rate, int size,
+                         dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+
+/**
+ * @brief Fast implement of dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is first element of output queue and should not be freed separately.
+ *
+ * @param in              Input convolution queue
+ * @param out             Output convolution queue
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @return                The result of dilation layer
+ */
+fptp_t *dl_dilation_layer(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                          dl_matrix2d_t* filter_kernel, dl_matrix2d_t* filter_bias,
+                          dl_matrix2d_t* gate_kernel, dl_matrix2d_t* gate_bias);
+
+
+void test_atrous_conv(int size, int rate, int in_channel, int out_channel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_convq8_queue.h b/include/esp32c3/dl_lib_convq8_queue.h
new file mode 100644
index 0000000..28c5da7
--- /dev/null
+++ b/include/esp32c3/dl_lib_convq8_queue.h
@@ -0,0 +1,303 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ8_QUEUE_H
+#define DL_LIB_CONVQ8_QUEUE_H
+
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//[nch, n, c]
+typedef struct {
+    int n;           /*< the length of queue */
+    int c;           /*< the number of queue element*/
+    int front;       /*< the front(top) position of queue */
+    int nch;         /*< the channel of queue */
+    int exponent;    /*< The values in items should be multiplied by pow(2,exponent) 
+                         to get the real values */
+    q8tp_t *itemq;    /*< Pointer to item array */
+} dl_convq8_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate an 8-bit fixed-point convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq8_queue_free(dl_convq8_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cqm    The fixed-point convolution queue whose itemq is set to 0
+ */
+void dl_convq8_queue_bzero(dl_convq8_queue_t *cqm);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element
+ */
+q8tp_t *dl_convq8_queue_pop(dl_convq8_queue_t *cq);
+q8tp_t *dl_convq8_queue_popn(dl_convq8_queue_t *cq, int n);
+
+/**
+ * @brief  Insert the float-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation.
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The float-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq8_queue_push_by_qmf(dl_convq8_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8(dl_convq8_queue_t *cq, int offset);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param ch      Channel index of queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8_mc(dl_convq8_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated.
+ *          The result is stored in the output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          Kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output (same parameter as in dl_atrous_conv1dq8_mc_steps; TODO confirm)
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @note                  Declared void: no pointer to the result is returned.
+ */
+void dl_atrous_conv1dq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                            int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated.
+ *          The result is stored in the output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @note                  Declared void: no pointer to the result is returned.
+ */
+void dl_dilation_layerq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                                dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                int offset, int prenum);
+
+
+
+
+dl_conv_queue_t *dl_convq8_queue_add(dl_convq8_queue_t *cq1, dl_convq8_queue_t *cq2);
+
+int8_t dl_sigmoid_lutq8(int in);
+/**
+ * @brief Allocate a 8-bit fixed-point Multi-Channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel number
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t **dl_convq8_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a 8-bit fixed-point Multi-Channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number
+ */
+void dl_convq8_queue_mc_free(dl_convq8_queue_t **cqm, int nch);
+
+/**
+ * @brief Tanh activation function for 8-bit fixed-point Multi-Channel convolution queue input
+ *
+ * @param cqm     Input 8-bit fixed-point Multi-Channel convolution queue
+ * @param offset  Offset used to calculate the beginning of input conv queue 
+ * @param nch     The channel number
+ */
+void dl_tanh_convq8_mc(dl_convq8_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief Fast and quantised 16-bit implement for Multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        Usually, this layer is used as first layer for 8-bit network.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. Input is a 16-bit queue point, Output is an 8-bit queue point.
+ *
+ * @param in              Input 16bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_16in_mc_steps(dl_convq_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                        dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast and quantised 8-bit implement for Multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The function is declared void: no pointer is returned.
+ *
+ * @param in              Input 8bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out,
+                                int nch, int rate, int size,
+                                dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                                int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implement of 8-bit dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The function is declared void: no pointer is returned.
+ *
+ * @param in              Input 8-bit fixed-point convolution queue
+ * @param out             Output 8-bit fixed-point convolution queue
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_dilation_layerq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                    dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                    dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                    int offset, int prenum);    
+
+void dl_convq8_queue_mc_bzero(dl_convq8_queue_t **cqm, int nch);
+
+
+
+dl_convq8_queue_t *dl_convq8_queue_alloc_from_psram(int n, int c);
+
+qtp_t *dl_dilation_layerq16_8(dl_convq_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_dilation_layerq8(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq8_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+dl_matrix2dq8_t *dl_convq8_lstm_layer(const dl_convq8_queue_t *in, dl_convq8_queue_t *out, dl_matrix2dq8_t *state_c,
+                                      dl_matrix2dq8_t *state_h, const dl_matrix2dq8_t *in_weight, const dl_matrix2dq8_t *h_weight,
+                                      const dl_matrix2dq_t *bias, int prenum);
+
+qtp_t *dl_atrous_conv1dq8_16_s3(dl_convq8_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq8_t* kernel, dl_matrix2dq_t* bias, int prenum);
+
+void print_convq8(dl_convq8_queue_t *cq, int offset);
+void print_convq(dl_convq_queue_t *cq, int offset);
+void dl_relu_convq8(dl_convq8_queue_t *cq);
+
+void lstmq8_free(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_convq_queue.h b/include/esp32c3/dl_lib_convq_queue.h
new file mode 100644
index 0000000..ff190fe
--- /dev/null
+++ b/include/esp32c3/dl_lib_convq_queue.h
@@ -0,0 +1,382 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ_QUEUE_H
+#define DL_LIB_CONVQ_QUEUE_H
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//fixed-point convolution FIFO queue.
+//[nch, n, c]
+typedef struct {
+    int n;           /*< the length of queue */
+    int c;           /*< the number of queue element*/
+    int front;       /*< the front(top) position of queue */
+    int nch;         /*< the channel number of queue */
+    int exponent;    /*< The values in items should be multiplied by pow(2,exponent)
+                         to get the real values */
+    qtp_t *itemq;    /*< Pointer to item array */
+} dl_convq_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+
+void dl_convq_to_matrix2dq(dl_convq_queue_t *cq, dl_matrix2dq_t* out, int row);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq_queue_free(dl_convq_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cq     The fixed-point convolution queue point
+ */
+void dl_convq_queue_bzero(dl_convq_queue_t *cq);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element
+ */
+qtp_t *dl_convq_queue_pop(dl_convq_queue_t *cq);
+qtp_t *dl_convq_queue_popn(dl_convq_queue_t *cq, int n);
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param a      The new element (the `shift` argument is not documented in this header -- TODO confirm)
+ */
+void dl_convq_queue_push(dl_convq_queue_t *cq, dl_matrix2dq_t *a, int shift);
+
+/**
+ * @brief  Insert the float-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation,  
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The float-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+void dl_convq16_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+dl_conv_queue_t *dl_queue_from_convq(dl_convq_queue_t *cq1);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param last_num  Offset from the front of the queue
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int last_num);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param offset    Offset from the front of the queue
+ * @param ch        Channel index of convolution queue 
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq_mc(dl_convq_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief   Does a tanh operation on the one of element in the convolution queue.
+ *          Gets the pointer of element in the convolution queue by offset, and does a
+ *          tanh operation by this pointer.
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @note          Declared void: no pointer is returned.
+ */
+void dl_tanh_convq(dl_convq_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on the one of element in multi channel convolution queue.
+ *          Gets the pointer of element in the convolution queue by offset, and does a
+ *          tanh operation by this pointer.
+ *
+ * @param cqm     Input fixed-point multi channel convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param nch     The channel number of cqm
+ * @note          Declared void: no pointer is returned.
+ */
+void dl_tanh_convq_mc(dl_convq_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief   Does a relu operation on the one of element in the convolution queue.
+ *          Gets the pointer of element in the convolution queue, and does a
+ *          relu operation by this pointer. Declared void: no pointer is returned.
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param clip      Clip value of the relu; NOTE(review): exact clipping semantics are not visible in this header -- confirm
+ * @param last_num  Presumably the offset from the front of the queue, as for dl_get_queue_itemq -- TODO confirm
+ */
+void dl_relu_convq(dl_convq_queue_t *cq, fptp_t clip, int last_num);
+
+/**
+ * @brief   Does a softmax operation on the one of element in the convolution queue.
+ *          Gets the pointer of element in the convolution queue by offset, input data
+ *          stay as it is. Results are saved into the *out* array.
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param out     Old array to re-use. Passing NULL will allocate a new matrix.
+ * @return        softmax results
+ */
+fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
+ * @return         The result of atrous convolution
+ */
+qtp_t * dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift          Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift            Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+   dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+   dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+   int filter_shift, int gate_shift, int offset, int prenum);
+
+
+qtp_t *dl_dilation_layerq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                          dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+                          int filter_shift, int gate_shift, int prenum);
+
+qtp_t *dl_dilation_layerq16(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                             dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                             dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset, int prenum);
+
+/**
+ * @brief Add a pair of fixed-point convolution queue item-by-item, and return float-point convolution queue
+ *
+ * @param cq1      First fixed-point convolution queue
+ * @param cq2      Second fixed-point convolution queue
+ * @return         The result of float-point convolution queue
+ */
+dl_conv_queue_t *dl_convq_queue_add(dl_convq_queue_t *cq1, dl_convq_queue_t *cq2);
+
+/**
+ * @brief Fast implement of LSTM layer by dl_atrous_conv1dq function
+ *
+ * @Warning LSTM kernel is split into two part, the first part input is the last layer output,
+ *           and kernel is parameter *in_weight*. The second part input is the last frame LSTM output,
+ *           the kernel is parameters *h_weight*.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param state_c         Internal state of the LSTM network
+ * @param state_h         Internal state (previous output values) of the LSTM network
+ * @param in_weight       the LSTM kernel needed by first part
+ * @param h_weight        the LSTM kernel needed by second part
+ * @param bias            The bias matrix of LSTM. Can be NULL if a bias of 0 is required.
+ * @param in_shift        Shift ratio used in first part
+ * @param h_shift         Shift ratio used in second part
+ * @return                The result of LSTM layer
+ */
+dl_matrix2dq_t *dl_convq_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                    dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight,
+                                    const dl_matrix2dq_t *bias, int in_shift, int h_shift, int prenum);
+dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                                       const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
+
+dl_matrix2dq_t *dl_convq16_lstm_layer(dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                       dl_matrix2dq_t *state_h, dl_matrix2dq_t *in_weight, dl_matrix2dq_t *h_weight,
+                                       dl_matrix2dq_t *bias, int prenum);
+
+/**
+ * @brief Allocate a fixed-point multi channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @param nch   The channel number of convolution queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t **dl_convq_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point multi channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number of cqm
+ */
+void dl_convq_queue_mc_free(dl_convq_queue_t **cqm, int nch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param nch      The channel number of input 
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
+ * @param offset   the offset to calculate input convq
+ * @param prenum   the preload size, 0: do not use preload function
+ * @return         The result of atrous convolution
+ */
+qtp_t *dl_atrous_conv1dq_mc_steps(  dl_convq_queue_t **in,
+                                    dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* kernel,
+									dl_matrix2dq_t* bias,
+									int shift,
+									int offset,
+									int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows for multi channel input
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param nch             The channel number of input 
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          The offset to calculate input convq
+ * @param prenum          The preload size, 0: do not use preload function
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_mc_steps( dl_convq_queue_t **in, 
+									dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* filter_kernel,
+									dl_matrix2dq_t* filter_bias,
+                                    dl_matrix2dq_t* gate_kernel,
+									dl_matrix2dq_t* gate_bias,
+                                    int filter_shift,
+									int gate_shift,
+									int offset,
+									int prenum);
+
+void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
+void test_lstm_convq(int size, int in_dim, int lstm_cell);
+void dl_nn_tanh_i162(dl_convq_queue_t **cqm, int offset, int nch);
+void dl_copy_queue_item_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit, int offset, int ch);
+void dl_convq_queue_mc_bzero(dl_convq_queue_t **cqm, int nch);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_matrix.h b/include/esp32c3/dl_lib_matrix.h
new file mode 100644
index 0000000..59f7d79
--- /dev/null
+++ b/include/esp32c3/dl_lib_matrix.h
@@ -0,0 +1,257 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIX_H
+#define DL_LIB_MATRIX_H
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+#if CONFIG_BT_SHARE_MEM_REUSE
+extern multi_heap_handle_t gst_heap;
+#endif
+
+//Flags for matrices
+#define DL_MF_FOREIGNDATA 1  /**< Matrix pointer and item data actually point to another matrix and should not be freed */
+#define DL_MF_FOREIGNITEM 2  /**< Only item data actually points to another matrix and should not be freed */
+
+//'Normal' float matrix
+typedef struct {
+    int w;          /**< Width */
+    int h;          /**< Height */
+    int stride;     /**< Row stride, essentially how many items to skip to get to the same position in the next row */
+    int flags;      /**< Flags. OR of DL_MF_* values */
+    fptp_t *item;   /**< Pointer to item array */
+} dl_matrix2d_t;
+
+//Macro to quickly access the raw items in a matrix. 'm' is parenthesized so any pointer expression (e.g. p + 1) expands safely.
+#define DL_ITM(m, x, y) (m)->item[(x)+(y)*(m)->stride]
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_alloc(int w, int h);
+
+
+/**
+ * @brief Free a matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->item space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrix_free(dl_matrix2d_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrix_zero(dl_matrix2d_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2d_t *dl_matrix_copy_to_psram(const dl_matrix2d_t *m);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_slice(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @warning The results are flattened in row-major order.
+ *   
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return  The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_flatten(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Generate a matrix from existing floating-point data
+ *
+ * @param w     Width of resulting matrix
+ * @param h     Height of resulting matrix
+ * @param data  Data to populate matrix with
+ * @return A newly allocated matrix populated with the given input data, or NULL if out of memory.
+ */
+dl_matrix2d_t *dl_matrix_from_data(int w, int h, int stride, const void *data);
+
+
+/**
+ * @brief Multiply a pair of matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_mul(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two matrices : res=a.b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_dot(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Add a pair of matrices item-by-item: out=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Added data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_add(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Divide a pair of matrices item-by-item: out=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_div(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Subtract a matrix from another, item-by-item: out=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Subtracted data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_sub(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Add a constant to every item of the matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrix_add_const(dl_matrix2d_t *subj, const fptp_t add);
+
+
+/**
+ * @brief Concatenate the rows of two matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated matrix with a|b as its values
+ */
+dl_matrix2d_t *dl_matrix_concat(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+dl_matrix2d_t *dl_matrix_concat_h( dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+/**
+ * @brief Print the contents of a matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrix(const dl_matrix2d_t *a);
+
+/**
+ * @brief Return the average square error given a correct and a test matrix.
+ *
+ * ...Well, more or less. If anything, it gives an indication of the error between
+ * the two. Check the code for the exact implementation.
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return value indicating the relative difference between matrices
+ */
+float dl_matrix_get_avg_sq_err(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+
+/**
+ * @brief Check if two matrices have the same shape, that is, the same amount of rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrix_same_shape(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+/**
+ * @brief Read one element of a float matrix
+ *
+ * External code should go through this accessor rather than using DL_ITM directly.
+ *
+ * @param m     Matrix to read from
+ * @param x     Column index
+ * @param y     Row index
+ * @return The element stored at column x, row y
+ */
+static inline fptp_t dl_matrix_get(const dl_matrix2d_t *m, const int x, const int y) {
+    return m->item[x + y * m->stride];
+}
+
+/**
+ * @brief Write one element of a float matrix
+ *
+ * External code should go through this accessor rather than using DL_ITM directly.
+ *
+ * @param m     Matrix to modify
+ * @param x     Column index
+ * @param y     Row index
+ * @param val   New value for the element at column x, row y
+ */
+static inline void dl_matrix_set(dl_matrix2d_t *m, const int x, const int y, fptp_t val) {
+    m->item[x + y * m->stride] = val;
+}
+
+void matrix_get_range(const dl_matrix2d_t *m, fptp_t *rmin, fptp_t *rmax);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32c3/dl_lib_matrixq.h b/include/esp32c3/dl_lib_matrixq.h
new file mode 100644
index 0000000..8ad397b
--- /dev/null
+++ b/include/esp32c3/dl_lib_matrixq.h
@@ -0,0 +1,387 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ_H
+#define DL_LIB_MATRIXQ_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int16_t qtp_t;
+
+//Quantized matrix. Uses fixed numbers and has the storage for the rows/columns inverted 
+//for easy use as a multiplicand without stressing out the flash cache too much.
+typedef struct {
+    int w; //Width
+    int h; //Height
+    int stride; //Normally equals h, not w!
+    int flags; //presumably an OR of DL_MF_* values, as in dl_matrix2d_t - TODO confirm
+    int exponent; //The values in itemq should be multiplied by pow(2,exponent) to get the real values.
+    qtp_t *itemq; //Pointer to the array of 16-bit (qtp_t) mantissas
+} dl_matrix2dq_t;
+
+#define DL_QTP_SHIFT 15
+#define DL_QTP_RANGE ((1<<DL_QTP_SHIFT)-1)
+#define DL_ITMQ(m, x, y) (m)->itemq[(y)+(x)*(m)->stride] //item (x, y) lives at index y + x*stride; 'm' is parenthesized so pointer expressions expand safely
+#define DL_QTP_EXP_NA 255 //non-applicable exponent because matrix is null
+
+#define DL_SHIFT_AUTO 32
+
+/**
+ * @info About quantized matrices and shift values
+ *
+ * Grab a coffee (or tea, or hot water)  and sit down when you read this for the first 
+ * time. Quantized matrices can speed up your operations, but come with some quirks, and
+ * it's good to understand how they work before using them.
+ *
+ * The data in the quantized matrix type is stored similarly to floating-point types:
+ * when storing a real value, the value is stored as a mantissa (base number) and an
+ * exponent. The 'real' value that can be re-derived from those two numbers is something
+ * similar to mantissa*2^exponent. Up to this point, there's not that much difference from 
+ * the standard floating point implementations like e.g. IEEE-754.
+ *
+ * The difference with respect to quantized matrices is that for a quantized matrix, it is 
+ * assumed all values stored have more-or-less the same order of magnitude. This allows the
+ * matrix to only store all the mantissas, while the exponents are shared; there is only one 
+ * exponent for the entire matrix. This makes it quicker to handle matrix operations - the
+ * logic to fix the exponents only needs to happen once, while the rest can be done in simple
+ * integer arithmetic. It also nets us some memory savings - while normally a floating point
+ * number is 32-bit, storing only 16-bit mantissas as the matrix items almost halves the 
+ * memory requirements.
+ *
+ * While most of the details of handling the intricacies of the quantized matrixes are done
+ * transparently by the code in dl_lib_matrixq.c, some implementation details leak out, 
+ * specifically in places where addition/subtraction/division happens.
+ *
+ * The problem is that the routines do not know what the size of the resulting operation is. For
+ * instance, when adding two matrices of numbers, the resulting numbers *could* be large enough
+ * to overflow the mantissa of the result if the exponent is the same. However, if by default we
+ * assume the mantissas need to be scaled back, we may lose precision.
+ *
+ * In order to counter this, all operations that have this issue have a ``shift`` argument. If 
+ * the argument is zero, the routine will be conservative, that is, increase the exponent of 
+ * the result to such an extent it's mathematically impossible a value in the result will exceed
+ * the maximum value that can be stored. However, when this argument is larger than zero, the
+ * algorithm will hold back on this scaling by the indicated amount of bits, preserving precision
+ * but increasing the chance of some of the calculated values not fitting in the mantissa anymore.
+ * If this happens, the value will be clipped to the largest (or, for negative values, smallest)
+ * value possible. (Neural networks usually are okay with this happening for a limited amount
+ * of matrix indices).
+ *
+ * For deciding on these shift values, it is recommended to start with a shift value of one, then
+ * use dl_matrixq_check_sanity on the result. If this indicates clipping, lower the shift value. 
+ * If it indicates bits are under-used, increase it. Note that for adding and subtraction, only
+ * shift values of 0 or 1 make sense; these routines will error out if you try to do something
+ * else.
+ *
+ * For neural networks and other noise-tolerant applications, note that even when 
+ * dl_matrixq_check_sanity does not indicate any problems, twiddling with the shift value may lead
+ * to slightly improved precision. Feel free to experiment.
+ **/
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
+dl_matrix2dq_t *dl_matrixq_alloc_psram(int w, int h);
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq_t *out);
+
+/**
+ * TODO: DESCRIBE THIS FUNCTION
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d_by_qmf(const dl_matrix2d_t *m, dl_matrix2dq_t *out, int m_bit, int f_bit);
+
+
+/**
+ * @brief Convert a quantized matrix to a floating-point one.
+ *
+ * @param m     Quantized matrix to convert
+ * @param out   Floating-point matrix to re-use. If NULL, allocate a new one.
+ * @return The floating-point version of the quantized matrix
+ **/
+dl_matrix2d_t *dl_matrix2d_from_matrixq(const dl_matrix2dq_t *m, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->itemq space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq_free(dl_matrix2dq_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrixq_zero(dl_matrix2dq_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq_t *dl_matrixq_copy_to_psram(const dl_matrix2dq_t *m);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b, Result is a fixed-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices: res=a.b, Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product.
+ *
+ * Result is a fixed-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot calls; this function can be
+ * much slower than dl_matrixq_dot .
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product. 
+ *
+ * Result is a floating-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot_matrix_out calls; this function can be
+ * much slower than dl_matrixq_dot_matrix_out.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of a floating point and a quantized matrix. Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand; float matrix
+ * @param b     Second multiplicand; quantized matrix
+ * @param res   Dotproduct data; float matrix. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_matrixq_dot(const dl_matrix2d_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+
+/**
+ * @brief Print the contents of a quantized matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrixq(const dl_matrix2dq_t *a);
+
+
+/**
+ * @brief Add a pair of quantized matrices item-by-item: res=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Added data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_add(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @warning In contrast to the floating point equivalent of this function, the fixed-point version
+ * of this has the issue that as soon as the output exponent of one of the slices changes, the data
+ * in the sliced matrix gets corrupted (because the exponent of that matrix is still the same.) If you
+ * use this function, either treat the slices as read-only, or assume the sliced matrix contains
+ * garbage after modifying the data in one of the slices.
+ *
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_slice(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @warning The results are flattened in row-major order.
+ *   
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_flatten(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Subtract a quantized matrix from another, item-by-item: res=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Subtracted data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_sub(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Multiply a pair of quantized matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that matrix.
+ */
+void dl_matrixq_mul( dl_matrix2dq_t *a,  dl_matrix2dq_t *b, dl_matrix2dq_t *res);
+
+/**
+ * @brief Divide a pair of quantized matrices item-by-item: out=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrixq_div(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Check if two quantized matrices have the same shape, that is, the same amount of 
+ * rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrixq_same_shape(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Concatenate the rows of two quantized matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated quantized matrix with as values a|b
+ */
+dl_matrix2dq_t *dl_matrixq_concat(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Add a constant to every item of the quantized matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrixq_add_const(dl_matrix2dq_t *subj, const fptp_t add, int shift);
+
+/**
+ * @brief Check the sanity of a quantized matrix
+ *
+ * Due to the nature of quantized matrices, depending on the calculations a quantized
+ * matrix is the result of and the shift values chosen in those calculations, a quantized
+ * matrix may have an exponent and mantissas that lead to a loss of precision, either because
+ * most significant mantissa bits are unused, or because a fair amount of mantissas are 
+ * clipped. This function checks if this is the case and will report a message to stdout
+ * if significant loss of precision is detected.
+ *
+ * @param m     The quantized matrix to check
+ * @param name  A string to be displayed in the message if the sanity check fails
+ * @return True if matrix is sane, false otherwise
+ **/
+
+int dl_matrixq_check_sanity(dl_matrix2dq_t *m, const char *name);
+
+/**
+ * @brief re-adjust the exponent of the matrix to fit the mantissa better
+ *
+ * This function will shift up all the data in the mantissas so there are no
+ * most-significant bits that are unused in all mantissas. It will also adjust
+ * the exponent to keep the actual values in the matrix the same.
+ *
+ * Some operations done on a matrix, especially operations that re-use the
+ * result of earlier operations done in the same way, can lead to the loss of
+ * data because the exponent of the quantized matrix is never re-adjusted. You
+ * can do that explicitly by calling this function.
+ *
+ * @param m     The matrix to re-adjust
+**/
+void dl_matrixq_readjust_exp(dl_matrix2dq_t *m);
+
+
+
+/**
+ * @brief Get the floating-point value of a specific item from the quantized matrix
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position
+ */
+fptp_t dl_matrixq_get(const dl_matrix2dq_t *m, const int x, const int y);
+
+/**
+ * @brief Set a specific item in the quantized matrix to the given 
+ * floating-point value
+ *
+ * @warning If the given value is more than the exponent in the quantized matrix
+ * allows for, all mantissas in the matrix will be shifted down to make the value
+ * 'fit'. If, however, the exponent is such that the value would result in a
+ * quantized mantissa of 0, nothing is done.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position
+ */
+void dl_matrixq_set(dl_matrix2dq_t *m, const int x, const int y, fptp_t val);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c3/dl_lib_matrixq8.h b/include/esp32c3/dl_lib_matrixq8.h
new file mode 100644
index 0000000..377df7c
--- /dev/null
+++ b/include/esp32c3/dl_lib_matrixq8.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ8_H
+#define DL_LIB_MATRIXQ8_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+#include "dl_lib.h"
+#include "dl_lib_matrixq.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int8_t q8tp_t;
+
+typedef struct {
+    int w; //Width
+    int h; //Height
+    int stride; //Normally equals h, not w!
+    int flags; //presumably an OR of DL_MF_* values, as in dl_matrix2d_t - TODO confirm
+    int exponent; //The values in itemq should be multiplied by pow(2,exponent) to get the real values.
+    q8tp_t *itemq; //Pointer to the array of 8-bit (q8tp_t) mantissas
+} dl_matrix2dq8_t;
+
+#define DL_Q8TP_SHIFT 7
+#define DL_Q8TP_RANGE ((1<<DL_Q8TP_SHIFT)-1)
+#define DL_ITMQ8(m, x, y) (m)->itemq[(y)+(x)*(m)->stride] //item (x, y) lives at index y + x*stride; 'm' is parenthesized so pointer expressions expand safely
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq8_t *dl_matrixq8_alloc(int w, int h);
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->itemq space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq8_free(dl_matrix2dq8_t *m);
+
+/**
+ * @brief Copy a quantized matrix
+ * Copy a quantized matrix from flash or iram/psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq8_t *dl_matrixq8_copy_to_psram(const dl_matrix2dq8_t *m);
+
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq8_t *dl_matrixq8_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq8_t *out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32c3/esp_aec.h b/include/esp32c3/esp_aec.h
new file mode 100644
index 0000000..36de9c1
--- /dev/null
+++ b/include/esp32c3/esp_aec.h
@@ -0,0 +1,105 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AEC_H_
+#define _ESP_AEC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_AEC_FFT                      // Not kiss_fft
+#define AEC_SAMPLE_RATE     16000        // Only Support 16000Hz
+#define AEC_FRAME_LENGTH_MS 32
+
+typedef struct aec_handle_t aec_handle_t;
+typedef enum {
+    AEC_MODE_SR_LOW_COST = 0,     // Low Cost AEC for speech recognition
+    AEC_MODE_SR_HIGH_PERF = 1,    // High Performance AEC for speech recognition
+    AEC_MODE_VOIP_LOW_COST = 3,   // Low Cost AEC for voice communication
+    AEC_MODE_VOIP_HIGH_PERF = 4,  // High Performance AEC for voice communication
+} aec_mode_t;
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ * Please get frame size by aec_get_chunksize() function
+ * 
+ * @param sample_rate       The Sampling frequency (Hz) must be 16000.
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Creates an instance to the AEC structure, same as aec_create() but without the sample_rate argument.
+ * 
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @warning The indata, refdata and outdata must be 16-bit signed. Please allocate memory by heap_caps_aligned_alloc().
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      An array of 16-bit signed audio samples from mic. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
+ * @param refdata     An array of 16-bit signed audio samples sent to the speaker.
+ * @param outdata     Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
+ * @return None
+ *
+ */
+void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int aec_get_chunksize(const aec_handle_t *handle);
+
+/**
+ * @brief Get AEC mode string 
+ * 
+ * @param aec_mode  The mode of AEC.
+ * 
+ * @return AEC mode string
+ */
+char * aec_get_mode_string(aec_mode_t aec_mode);
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void aec_destroy(aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AEC_H_
diff --git a/include/esp32c3/esp_afe_aec.h b/include/esp32c3/esp_afe_aec.h
new file mode 100644
index 0000000..9d60588
--- /dev/null
+++ b/include/esp32c3/esp_afe_aec.h
@@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;        // AEC instance (type declared in esp_aec.h)
+    aec_mode_t mode;             // AEC mode
+    afe_pcm_config_t pcm_config; // PCM channel and sample-rate description (see afe_pcm_config_t)
+    int frame_size;              // NOTE(review): presumably the frame size reported by afe_aec_get_chunksize() -- confirm
+    int16_t  *data;              // NOTE(review): int16 buffer; its purpose and ownership are not visible in this header
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only 1 microphone channel and 1 playback channel are supported.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get the frame size of AEC (the number of samples in one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size, in samples
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
diff --git a/include/esp32c3/esp_afe_config.h b/include/esp32c3/esp_afe_config.h
new file mode 100644
index 0000000..f9de6fe
--- /dev/null
+++ b/include/esp32c3/esp_afe_config.h
@@ -0,0 +1,69 @@
+#pragma once
+#include "esp_aec.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// VC:  Voice Communication
+
+// AFE_SR mode (deprecated, please use afe_mode_t instead)
+typedef enum {
+    SR_MODE_LOW_COST = 0,  // Deprecated, please use afe_mode_t, AFE mode: low cost mode
+    SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
+} afe_sr_mode_t;
+
+// AFE mode: low cost or high performance
+typedef enum {
+    AFE_MODE_LOW_COST = 0,  // AFE mode: low cost mode
+    AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
+} afe_mode_t;
+
+// AFE type: speech recognition (SR) or voice communication (VC)
+typedef enum {
+    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+} afe_type_t;
+
+typedef enum {
+    AFE_MEMORY_ALLOC_MORE_INTERNAL = 1,          // allocate with more internal RAM
+    AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // allocate with internal RAM and PSRAM in balance
+    AFE_MEMORY_ALLOC_MORE_PSRAM = 3              // allocate with more PSRAM
+} afe_memory_alloc_mode_t;
+
+typedef enum {
+    AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
+    AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
+    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
+    AFE_MN_PEAK_NO_AGC = 0,      // There is no agc gain
+} afe_mn_peak_agc_mode_t;
+
+typedef struct {
+    int total_ch_num; // total channel number, including microphone, playback and unknown channels
+    int mic_num;      // microphone channel number
+    uint8_t *mic_ids; // microphone channel indices
+    int ref_num;      // playback reference channel number
+    uint8_t *ref_ids; // playback reference channel indices
+    int sample_rate;  // sample rate of audio
+} afe_pcm_config_t;
+
+typedef enum {
+    AFE_NS_MODE_WEBRTC = 0, // Please use the model name of NS, SSP: "WEBRTC"
+    AFE_NS_MODE_NET = 1,    // Please use the model name of NSNET
+} afe_ns_mode_t;
+
+typedef enum {
+    AFE_AGC_MODE_WEBRTC = 0,  // WEBRTC AGC
+    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by the wakenet model if wakenet is activated
+} afe_agc_mode_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/include/esp32c3/esp_mfcc_fbank_int16.h b/include/esp32c3/esp_mfcc_fbank_int16.h
new file mode 100644
index 0000000..22a5f2c
--- /dev/null
+++ b/include/esp32c3/esp_mfcc_fbank_int16.h
@@ -0,0 +1,86 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t; // Opaque mfcc runner data; the struct is not defined in this header
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channels
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function type used to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data. NOTE(review): presumably NULL on error -- confirm against the implementation.
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration, frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
+ * @param nch NOTE(review): not documented originally; presumably the number of channels in samp -- confirm.
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
+ *         when done; some implementations require the buffer to be freed before the next call to this function.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+/**
+ * @brief Clean all state of a mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner (table of function pointers)
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;   // see esp_mfcc_op_destroy_t
+    esp_mfcc_op_create_t create;     // see esp_mfcc_op_create_t
+    esp_mfcc_op_run_step_t run_step; // see esp_mfcc_op_run_step_t
+    esp_mfcc_op_clean_t clean;       // see esp_mfcc_op_clean_t
+} esp_mfcc_iface_t;
diff --git a/include/esp32c3/esp_mfcc_iface.h b/include/esp32c3/esp_mfcc_iface.h
new file mode 100644
index 0000000..0257768
--- /dev/null
+++ b/include/esp32c3/esp_mfcc_iface.h
@@ -0,0 +1,89 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t; // Opaque mfcc runner data; the struct is not defined in this header
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channels
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function type used to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data. NOTE(review): presumably NULL on error -- confirm against the implementation.
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration, frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
+ * @param nch NOTE(review): not documented originally; presumably the number of channels in samp -- confirm.
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
+ *         when done; some implementations require the buffer to be freed before the next call to this function.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+typedef void (*esp_mfcc_op_run_step_s16_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t *fbank); // NOTE(review): undocumented; presumably an int16 variant of run_step that writes its result into fbank -- confirm
+
+/**
+ * @brief Clean all state of a mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner (table of function pointers)
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;           // see esp_mfcc_op_destroy_t
+    esp_mfcc_op_create_t create;             // see esp_mfcc_op_create_t
+    esp_mfcc_op_run_step_t run_step;         // see esp_mfcc_op_run_step_t
+    esp_mfcc_op_run_step_s16_t run_step_s16; // see esp_mfcc_op_run_step_s16_t
+    esp_mfcc_op_clean_t clean;               // see esp_mfcc_op_clean_t
+} esp_mfcc_iface_t;
diff --git a/include/esp32c3/esp_mfcc_models.h b/include/esp32c3/esp_mfcc_models.h
new file mode 100644
index 0000000..44086e8
--- /dev/null
+++ b/include/esp32c3/esp_mfcc_models.h
@@ -0,0 +1,44 @@
+#pragma once
+#include "esp_mfcc_iface.h"
+
+extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle (defined outside this header)
+extern const esp_mfcc_iface_t esp_fbank_s16; // int16-fbank handle (defined outside this header)
+
+/**
+ * @brief Return basic opts used in wakenet9 & multinet5 (takes no arguments)
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9(void);
+
+/**
+ * @brief Return basic opts used in wakenet9s (takes no arguments)
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9s16(void);
+
+/**
+ * @brief Return basic opts for default kaldifeat (takes no arguments). Documented field values:
+ *
+    opts->psram_first = true;
+    opts->use_power = true;
+    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
+    opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
+    opts->win_type = "povey";
+    opts->low_freq = 20;
+    opts->high_freq = 7600;
+    opts->samp_freq = 16000;
+    opts->nch = 1;
+    opts->nfft = 512;
+    opts->nfilter = 80;
+    opts->numcep = 80;
+    opts->preemph = 0.97;
+    opts->append_energy = false;
+    opts->winlen_ms = 25;
+    opts->winstep_ms = 10;
+    opts->remove_dc_offset = true;
+ *
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_kaldi(void);
+
+/**
+ * @brief Print the given mfcc opts
+ **/
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
diff --git a/include/esp32c3/esp_speech_features.h b/include/esp32c3/esp_speech_features.h
new file mode 100644
index 0000000..c1659f9
--- /dev/null
+++ b/include/esp32c3/esp_speech_features.h
@@ -0,0 +1,62 @@
+#pragma once
+#include "c_speech_features_config.h"
+#include "stdlib.h"
+#include <assert.h>
+#include <stdbool.h>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005 /* 2 * pi */
+#endif
+
+typedef struct {
+    float *coeff;  // NOTE(review): presumably the mel filter coefficients -- layout not visible in this header
+    int *bank_pos; // NOTE(review): presumably per-filter positions into the spectrum -- confirm
+    int nfilter;   // NOTE(review): presumably the number of filters, as passed to esp_mel_filter_init() -- confirm
+} esp_mel_filter_t;
+
+float *esp_mfcc_malloc(size_t size, bool from_psram); // NOTE(review): undocumented; confirm whether size is in bytes or floats, and that from_psram selects PSRAM
+
+void esp_mfcc_free(void *ptr); // NOTE(review): undocumented; presumably releases memory obtained from esp_mfcc_malloc() -- confirm
+
+/**
+ * @brief Initialize FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For other platforms, use kiss fft
+ *
+ * @param nfft  The input samples number
+ * @return fft-table
+ **/
+void *esp_fft_init(int nfft);
+
+/**
+ * @brief Free FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For other platforms, use kiss fft
+ *
+ * @param fft_table  The fft table initialized by esp_fft_init
+ * @param nfft       The input samples number
+ * @return None
+ **/
+void esp_fft_deinit(void *fft_table, int nfft);
+
+/**
+ * @brief Initialize window function
+ *        Currently supports hanning, hamming, sine, povey, rectangular and
+ *        wn9 (512-hanning, to be compatible with wakenet9 & multinet5)
+ **/
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
+
+float *esp_fftr(float *x, int nfft, void *fft_table); // NOTE(review): undocumented; fft_table is presumably the table from esp_fft_init() -- confirm
+
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); // NOTE(review): undocumented; use_power presumably selects power vs. magnitude spectrum -- confirm
+
+void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); // NOTE(review): undocumented; presumably converts len samples into x -- confirm
+
+float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); // NOTE(review): undocumented; confirm the meaning of last and whether x is modified
+
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); // NOTE(review): undocumented; result is presumably released with esp_mel_filter_deinit() -- confirm
+
+void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); // NOTE(review): undocumented; presumably frees a filter from esp_mel_filter_init() -- confirm
+
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); // NOTE(review): undocumented; use_log_fbank/epsilon presumably follow esp_mfcc_opts_t -- confirm
diff --git a/include/esp32c3/esp_wn_iface.h b/include/esp32c3/esp_wn_iface.h
new file mode 100644
index 0000000..44bab8d
--- /dev/null
+++ b/include/esp32c3/esp_wn_iface.h
@@ -0,0 +1,215 @@
+#pragma once
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container; the struct is not defined in this header
+typedef struct model_iface_data_t model_iface_data_t;
+
+/**
+ * @brief The state of wake word detection
+ */
+typedef enum
+{
+    WAKENET_NO_DETECT = 0,               // wake word is not detected
+    WAKENET_CHANNEL_VERIFIED = -1,       // output channel is verified
+    WAKENET_DETECTED = 1                 // wake word is detected
+} wakenet_state_t;
+
+// Set wake words recognition operating mode.
+// The probability of being wake words is increased with increasing mode;
+// as a consequence the false alarm rate also goes up.
+typedef enum {
+	DET_MODE_90 = 0,       // Normal
+	DET_MODE_95 = 1,       // Aggressive
+    DET_MODE_2CH_90 = 2,   // NOTE(review): presumably the 2-channel variant of DET_MODE_90 -- confirm
+    DET_MODE_2CH_95 = 3,   // NOTE(review): presumably the 2-channel variant of DET_MODE_95 -- confirm
+    DET_MODE_3CH_90 = 4,   // NOTE(review): presumably the 3-channel variant of DET_MODE_90 -- confirm
+    DET_MODE_3CH_95 = 5,   // NOTE(review): presumably the 3-channel variant of DET_MODE_95 -- confirm
+} det_mode_t;
+
+typedef struct {
+    int wake_word_num;     // The number of all wake words
+    char **wake_word_list; // The name list of wake words
+} wake_word_info_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode and specified wake word coefficient
+ *
+ * @param model_name  The specified wake word model coefficient
+ * @param det_mode    The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const void *model_name, det_mode_t det_mode);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function, in 16-bit samples
+ */
+typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the channel number of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of channels at the same time. This function
+ * can be used to query that number.
+ *
+ * @param model The model object to query
+ * @return The number of channels to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the start point of wake word when one wake word is detected.
+ *
+ * @warning This function should be called when the channel index is verified.
+ * The returned value is the number of samples from start point of wake word to detected point.
+ *
+ * @param model The model object to query
+ * @return The number of samples from start point to detected point (end point)
+ */
+typedef int (*esp_wn_iface_op_get_start_point_t)(model_iface_data_t *model);
+
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in Hz
+ */
+typedef int (*esp_wn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of wake words
+ *
+ * @param model The model object to query
+ * @returns The number of wake words
+ */
+typedef int (*esp_wn_iface_op_get_word_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the name of wake word by index
+ *
+ * @warning The index of wake word start with 1
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns The name of the wake word
+ */
+typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param word_index The index of wake word
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
+
+/**
+ * @brief Get the wake word detection threshold of different modes
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns The detection threshold
+ */
+typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Feed samples of an audio stream to the keyword detection model and detect if there is a keyword found.
+ *
+ * @warning The index of wake word start with 1, 0 means no wake words is detected.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
+ *        get_samp_chunksize function.
+ * @return The index of wake words, 0 if none is detected. NOTE(review): the declared return type is wakenet_state_t -- confirm how the index maps onto it.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Get the volume gain
+ *
+ * @param model The model object to query
+ * @param target_db  The target dB to calculate volume gain
+ * @returns The volume gain
+ */
+typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero.
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of the model
+ *
+ * @param model The model object to clean
+ */
+typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a speech recognition model
+ *
+ * @param model The model object to destroy
+ */
+typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether it is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param samples NOTE(review): not documented originally; presumably 16-bit audio samples as in the detect function -- confirm
+ * @param cq An array of 16-bit MFCC.
+ * @return The index of wake words, return 0 if no wake word is detected, else the index of the wake words.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get the MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data, as a dl_convq_queue_t pointer
+ */
+typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+
+/**
+ * This structure contains the function pointers used to do operations on a wake word detection model.
+ */
+typedef struct {
+    esp_wn_iface_op_create_t create;
+    esp_wn_iface_op_get_start_point_t get_start_point;
+    esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_wn_iface_op_get_channel_num_t get_channel_num;
+    esp_wn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_wn_iface_op_get_word_num_t get_word_num;
+    esp_wn_iface_op_get_word_name_t get_word_name;
+    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
+    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
+    esp_wn_iface_op_detect_t detect;
+    esp_wn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_wn_iface_op_clean_t clean;
+    esp_wn_iface_op_destroy_t destroy;
+} esp_wn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/include/esp32c3/esp_wn_models.h b/include/esp32c3/esp_wn_models.h
new file mode 100644
index 0000000..3a4d7e4
--- /dev/null
+++ b/include/esp32c3/esp_wn_models.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "esp_wn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of wakenet model names, used to filter all wakenet models from the available models.
+#define ESP_WN_PREFIX "wn"
+
+/**
+ * @brief Get the wakenet handle from model name
+ *
+ * @param model_name   The name of the model
+ * @returns The handle of wakenet
+ */
+const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
+
+/**
+ * @brief Get the wake word name from model name
+ *
+ * @param model_name   The name of the model
+ * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
+ */
+char* esp_wn_wakeword_from_name(const char *model_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+Example usage:
+static const esp_wn_iface_t *model = esp_wn_handle_from_name(model_name);
+
+//Initialize wakeNet model data
+static model_iface_data_t *model_data=model->create(model_name, DET_MODE_90);
+
+//Set parameters of buffer
+int audio_chunksize=model->get_samp_chunksize(model_data);
+int frequency = model->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+int r=model->detect(model_data, buffer);
+if (r>0) {
+    printf("Detection triggered output %d.\n",  r);
+}
+
+//Destroy model
+model->destroy(model_data);
+
+*/
diff --git a/include/esp32c6/c_speech_features_config.h b/include/esp32c6/c_speech_features_config.h
new file mode 100644
index 0000000..e21e020
--- /dev/null
+++ b/include/esp32c6/c_speech_features_config.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <float.h>
+#include <math.h>
+
+/* ENABLE_DOUBLE is not defined by this header; unless it is defined elsewhere, the float variants below are selected. */
+
+#ifdef ENABLE_DOUBLE /* double-precision csf_* aliases */
+# define csf_float double
+# define csf_ceil ceil
+# define csf_floor floor
+# define csf_sin sin
+# define csf_log log
+# define csf_log10 log10
+# define csf_pow pow
+# define csf_sqrt sqrt
+# define csf_abs fabs
+# define csf_float_min DBL_MIN
+#else /* !ENABLE_DOUBLE: single-precision csf_* aliases */
+# define csf_float float
+# define csf_ceil ceilf
+# define csf_floor floorf
+# define csf_sin sinf
+# define csf_log logf
+# define csf_log10 log10f
+# define csf_pow powf
+# define csf_sqrt sqrtf
+# define csf_abs fabsf
+# define csf_float_min FLT_MIN
+#endif /* ENABLE_DOUBLE */
diff --git a/include/esp32c6/dl_lib.h b/include/esp32c6/dl_lib.h
new file mode 100644
index 0000000..47e7c86
--- /dev/null
+++ b/include/esp32c6/dl_lib.h
@@ -0,0 +1,418 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_H
+#define DL_LIB_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#include "esp_heap_caps.h"
+#include "sdkconfig.h"
+#define DL_SPIRAM_SUPPORT 1
+#endif
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+#include "esp32s3/rom/cache.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int padding_state;
+
+// /**
+//  * @brief Allocate a chunk of memory which has the given capabilities.
+//  *        Equivalent semantics to libc malloc(), for capability-aware memory.
+//  *        In IDF, malloc(p) is equivalent to heap_caps_malloc(p, MALLOC_CAP_8BIT).
+//  * 
+//  * @param size  In bytes, of the amount of memory to allocate
+//  * @param caps  Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned
+//  *              MALLOC_CAP_SPIRAM:   Memory must be in SPI RAM
+//  *              MALLOC_CAP_INTERNAL: Memory must be internal; specifically it should not disappear when flash/spiram cache is switched off
+//  *              MALLOC_CAP_DMA:      Memory must be able to accessed by DMA
+//  *              MALLOC_CAP_DEFAULT:  Memory can be returned in a non-capability-specific memory allocation
+//  * @return Pointer to currently allocated heap memory
+//  **/
+// void *heap_caps_malloc(size_t size, uint32_t caps);
+
+/**
+ * @brief Allocate aligned memory from internal memory or external memory.
+ *        if cnt*size > CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL, allocate memory from internal RAM
+ *        else, allocate memory from PSRAM
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently allocated heap memory
+ */
+void *dl_lib_calloc(int cnt, int size, int align);
+
+/**
+ * @brief Always allocate aligned memory from external memory.
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently aligned heap memory
+ */
+void *dl_lib_calloc_psram(int cnt, int size, int align);
+
+/**
+ * @brief Free aligned memory allocated by `dl_lib_calloc` or `dl_lib_calloc_psram` 
+ * 
+ * @param ptr    Pointer to free
+ */
+void dl_lib_free(void *ptr);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * As described in https://codingforspeed.com/using-faster-exponential-approximation/
+ * Should be good til an input of 5 or so with a steps factor of 8.
+ *
+ * @param x Floating point input
+ * @param steps Approximation steps. More is more precise. 8 or 10 should be good enough for most purposes.
+ * @return Exp()'ed output
+ */
+fptp_t fast_exp(double x, int steps);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * @param x Floating point input
+ * @return Exp()'ed output
+ */
+double fast_exp_pro(double x);
+
+/**
+ * @brief Does a softmax operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a softmax operation on a quantized matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a floating point number
+ *
+ * @param in Floating point input
+ * @return Sigmoid output
+ */
+
+fptp_t dl_sigmoid_op(fptp_t in);
+
+
+/**
+ * @brief Does a sigmoid operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+/**
+ * @brief Does a tanh operation on a floating point number
+ *
+ * @param v         Floating point input number
+ * @return Tanh value
+ */
+fptp_t dl_tanh_op(fptp_t v);
+
+/**
+ * @brief Does a tanh operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a floating point number
+ *
+ * @param in        Floating point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+fptp_t dl_relu_op(fptp_t in, fptp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Fully connected layer operation
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Biases for the neurons. Can be NULL if a bias of 0 is required.
+ * @param out       Output array. Outputs are placed here. Needs to be an initialized, weight->w by in->h in size, matrix.
+ */
+void dl_fully_connect_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, dl_matrix2d_t *out);
+
+/**
+ * @brief Pre-calculate the sqrtvari variable for the batch_normalize function.
+ * The sqrtvari matrix depends on the variance and epsilon values, which normally are constant. Hence,
+ * this matrix only needs to be calculated once. This function does that.
+ *
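+ * @param variance  Variance matrix
+ * @param epsilon   Epsilon value
+ * @param out       Output matrix that receives the precalculated sqrtvari values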
+void dl_batch_normalize_get_sqrtvar(const dl_matrix2d_t *variance, fptp_t epsilon, dl_matrix2d_t *out);
+
+/**
+ * @brief Batch-normalize a matrix
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @return
+ */
+void dl_batch_normalize(dl_matrix2d_t *m, const dl_matrix2d_t *offset, const dl_matrix2d_t *scale, 
+                        const dl_matrix2d_t *mean, const dl_matrix2d_t *sqrtvari);
+
+/**
+ * @brief Do a basic LSTM layer pass.
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @return          Output values of the neurons
+ */
+dl_matrix2d_t *dl_basic_lstm_layer(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h, 
+                const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a basic LSTM layer pass, partial quantized version.
+ * This LSTM function accepts 16-bit fixed-point weights and 32-bit float-point bias. 
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in		Input vector
+ * @param state_c	Internal state of the LSTM network
+ * @param state_h	Internal state (previous output values) of the LSTM network
+ * @param weight	Weights for the neurons, need to be quantised
+ * @param bias		Bias for the neurons. Can be NULL if no bias is required
+ * @return			Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_quantised_weights(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+				const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a fully-connected layer pass, fully-quantized version.
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Bias values of the neurons. Can be NULL if no bias is needed.
+ * @param out       Output matrix. Output values of the neurons are placed here.
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ */
+void dl_fully_connect_layer_q(const dl_matrix2dq_t *in, const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Do a basic LSTM layer pass, fully-quantized version
+ *
+ * @warning Returns state_h pointer, so do not free result.
+
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return          Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int shift);
+
+/**
+ * @brief Batch-normalize a matrix, fully-quantized version
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return
+ */
+void dl_batch_normalize_q(dl_matrix2dq_t *m, const dl_matrix2dq_t *offset, const dl_matrix2dq_t *scale, 
+                        const dl_matrix2dq_t *mean, const dl_matrix2dq_t *sqrtvari, int shift);
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a fixed-point number
+ * This accepts and returns fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in        Fixed-point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+qtp_t dl_relu_q_op(qtp_t in, qtp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in Fixed-point input
+ * @return Sigmoid output
+ */
+int dl_sigmoid_op_q(const int in);
+int16_t dl_sigmoid_op_q8(const int16_t in);
+/**
+ * @brief Does a sigmoid operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param v Fixed-point input
+ * @return tanh output
+ */
+int dl_tanh_op_q(int v);
+int16_t dl_tanh_op_q8(int16_t v);
+
+void load_mat_psram_mn4(void);
+void load_mat_psram_mn3(void);
+void free_mat_psram_mn4(void);
+void free_mat_psram_mn3(void);
+qtp_t dl_hard_sigmoid_op(qtp_t in, int exponent);
+qtp_t dl_hard_tanh_op(qtp_t in, int exponent);
+
+int16_t dl_table_tanh_op(int16_t in, int exponent);
+int16_t dl_table_sigmoid_op(int16_t in, int exponent);
+
+void dl_hard_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_hard_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+void dl_table_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_table_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+
+/**
+ * @brief Clip the values in the matrix that are greater than clip, float version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Clip the values in the matrix that are greater than clip, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+/**
+ * @brief Do a basic CNN layer pass.
+ *
+ * @warning This only supports a single-channel input image, and the output is a single-row matrix.
+ *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image 
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height
+ * @param bias           Bias for the CNN layer.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1.
+ * @return               The result of CNN layer.
+ */
+dl_matrix2d_t *dl_basic_conv_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                   const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+
+/**
+ * @brief Do a basic CNN layer pass, quantised version.
+ *
+ * @warning This only supports a single-channel input image, and the output is a single-row matrix.
+ *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image 
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height,
+ * @param bias           Bias of the neurons.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1
+ * @return               The result of CNN layer
+ */
+dl_matrix2d_t *dl_basic_conv_layer_quantised_weight(const dl_matrix2d_t *in, const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                                     const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_coefgetter_if.h b/include/esp32c6/dl_lib_coefgetter_if.h
new file mode 100644
index 0000000..a21de8d
--- /dev/null
+++ b/include/esp32c6/dl_lib_coefgetter_if.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_COEFGETTER_IF_H
+#define DL_LIB_COEFGETTER_IF_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "cJSON.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Set this if the coefficient requested is a batch-normalization popvar matrix which needs to be preprocessed by
+//dl_batch_normalize_get_sqrtvar first.
+#define COEF_GETTER_HINT_BNVAR (1<<0)
+
+/*
+This struct describes the basic information of model data:
+word_num: the number of wake words or speech commands
+word_list: the name list of wake words or speech commands
+win_list: presumably the window list of wake words or speech commands (TODO confirm)
+thresh_list: the threshold list of wake words or speech commands
+info_str: the string describing the version and content of the model data (network architecture, model data version, wake words and their thresholds)
+*/
+typedef struct {
+    int word_num;
+    char **word_list;
+    int *win_list;
+    float *thresh_list;
+    char *info_str;
+} model_info_t;
+
+/*
+Alphabet struct describes the basic grapheme or phoneme.
+item_num: the number of basic items (graphemes or phonemes)
+items: the list of basic items
+*/
+typedef struct {
+    int item_num;
+    char **items;
+}alphabet_t;
+
+/*
+This struct describes a generic coefficient getter: a way to get the constant coefficients needed for a neural network.
+For the getter_f/getter_q/getter_q8 getters, the name describes the name of the coefficient matrix, usually the same as
+the Numpy filename the coefficient was originally stored in. The arg argument can be used to optionally pass an additional
+user-defined argument to the getter (e.g. the directory to look for files in the case of the Numpy file loader getter).
+The hint argument is a bitwise OR of the COEF_GETTER_HINT_* flags or 0 when none is needed. Use the matching
+free_f/free_q/free_q8 function to release the memory for the returned matrices, when applicable.
+*/
+typedef struct {
+    const dl_matrix2d_t* (*getter_f)(const char *name, void *arg, int hint);
+    const dl_matrix2dq_t* (*getter_q)(const char *name, void *arg, int hint);
+    const dl_matrix2dq8_t* (*getter_q8)(const char *name, void *arg, int hint);
+    void (*free_f)(const dl_matrix2d_t *m);
+    void (*free_q)(const dl_matrix2dq_t *m);
+    void (*free_q8)(const dl_matrix2dq8_t *m);
+    const model_info_t* (*getter_info)(void *arg);
+    const alphabet_t* (*getter_alphabet)(void *arg);
+    const cJSON* (*getter_config)(void *arg);
+} model_coeff_getter_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_conv_queue.h b/include/esp32c6/dl_lib_conv_queue.h
new file mode 100644
index 0000000..7cb9bf9
--- /dev/null
+++ b/include/esp32c6/dl_lib_conv_queue.h
@@ -0,0 +1,180 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONV_QUEUE_H
+#define DL_LIB_CONV_QUEUE_H
+
+
+#include "dl_lib_matrix.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+//Flags for matrices
+// #define DL_MF_FOREIGNDATA (0)  /*< Matrix *item data actually points to another matrix and should not be freed */
+
+// Float convolution FIFO queue.
+typedef struct {
+    int n;          /**< the length of queue */
+    int c;          /**< the channel number of queue element */
+    int front;      /**< the front(top) position of queue */
+    int flag;       /**< not used */
+    fptp_t *item;   /**< Pointer to item array */
+} dl_conv_queue_t;
+
+/**
+ * @brief Allocate a convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a convolution queue from psram
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Free a convolution queue
+ *
+ * @param cq     The convolution queue to free
+ */
+void dl_conv_queue_free(dl_conv_queue_t *cq);
+
+void dl_conv_to_matrix2d(dl_conv_queue_t *cq, dl_matrix2d_t* out);
+
+/**
+ * @brief Move the front pointer of queue forward, 
+          the First(oldest) element become the last(newest) element, 
+ *
+ * @param cq    Input convolution queue
+ * @return      Pointer of oldest element  
+ */
+fptp_t *dl_conv_queue_pop(dl_conv_queue_t *cq);
+
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ *
+ * @param cq     Input convolution queue
+ * @param item   The new element
+ */
+void dl_conv_queue_push(dl_conv_queue_t *cq, fptp_t* item);
+
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_get_queue_item(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a sigmoid operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a sigmoid operation
+ * by this pointer, then return the pointer      
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_sigmoid_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a tanh operation
+ * by this pointer, then return the pointer  
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_tanh_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a softmax operation on the one of element in the convolution queue.
+ * Gets the pointer of element in the convolution queue by offset, and does a softmax operation
+ * by this pointer, then return the pointer 
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_softmax_step(dl_conv_queue_t *cq, int offset);
+
+fptp_t *dl_relu_step(dl_conv_queue_t *cq, int offset);
+fptp_t *dl_relu_look(dl_matrix2d_t *cq, int offset);
+dl_matrix2d_t *dl_matrix_concat1(const dl_conv_queue_t *a, const dl_matrix2d_t *b);
+dl_matrix2d_t *dl_basic_lstm_layer1(const dl_conv_queue_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+                                   const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+/**
+ * @brief Fast implementation of 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The returned pointer
+ *          is the first element of the output queue and should not be freed separately.
+ *
+ * @param in       Input convolution queue
+ * @param out      Output convolution queue
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @return         The result of atrous convolution
+ */
+fptp_t *dl_atrous_conv1d_step(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                              dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+fptp_t *dl_look_conv_step(dl_conv_queue_t *in, dl_matrix2d_t *out, int rate, int size,
+                         dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+
+/**
+ * @brief Fast implementation of a dilation layer, as follows
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The returned pointer
+ *          is the first element of the output queue and should not be freed separately.
+ *
+ * @param in              Input convolution queue
+ * @param out             Output convolution queue
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @return                The result of dilation layer
+ */
+fptp_t *dl_dilation_layer(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                          dl_matrix2d_t* filter_kernel, dl_matrix2d_t* filter_bias,
+                          dl_matrix2d_t* gate_kernel, dl_matrix2d_t* gate_bias);
+
+
+void test_atrous_conv(int size, int rate, int in_channel, int out_channel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_convq8_queue.h b/include/esp32c6/dl_lib_convq8_queue.h
new file mode 100644
index 0000000..28c5da7
--- /dev/null
+++ b/include/esp32c6/dl_lib_convq8_queue.h
@@ -0,0 +1,303 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ8_QUEUE_H
+#define DL_LIB_CONVQ8_QUEUE_H
+
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// 8-bit fixed-point convolution queue: [nch, n, c]
+typedef struct {
+    int n;           /**< The length of queue */
+    int c;           /**< The number of elements in the queue */
+    int front;       /**< The front (top) position of queue */
+    int nch;         /**< The channel of queue */
+    int exponent;    /**< The values in items should be multiplied by pow(2,exponent)
+                          to get the real values */
+    q8tp_t *itemq;    /**< Pointer to item array */
+} dl_convq8_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a multi-channel fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate a multi-channel 8-bit fixed-point convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq8_queue_free(dl_convq8_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cqm    The fixed-point convolution queue whose items are set to 0
+ */
+void dl_convq8_queue_bzero(dl_convq8_queue_t *cqm);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element
+ */
+q8tp_t *dl_convq8_queue_pop(dl_convq8_queue_t *cq);
+q8tp_t *dl_convq8_queue_popn(dl_convq8_queue_t *cq, int n); /* NOTE(review): undocumented; presumably the n-element variant of pop - confirm */
+
+/**
+ * @brief  Insert the floating-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation.
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The floating-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq8_queue_push_by_qmf(dl_convq8_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8(dl_convq8_queue_t *cq, int offset);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param ch      Channel index of queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8_mc(dl_convq8_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief Fast and quantised implementation of 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated
+ *          before this function is called.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          Kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Shift ratio used in dot operation between two 16-bit fixed point vector
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @note                  The function is declared void and returns no value.
+ */
+void dl_atrous_conv1dq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                            int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implementation of a dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @warning All input and output convolution queues and matrices should be allocated
+ *          before this function is called.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @note                  The function is declared void and returns no value.
+ */
+void dl_dilation_layerq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                                dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                int offset, int prenum);
+
+
+
+
+dl_conv_queue_t *dl_convq8_queue_add(dl_convq8_queue_t *cq1, dl_convq8_queue_t *cq2);
+
+int8_t dl_sigmoid_lutq8(int in);
+/**
+ * @brief Allocate an 8-bit fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel number
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t **dl_convq8_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a 8-bit fixed-point Multi-Channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number
+ */
+void dl_convq8_queue_mc_free(dl_convq8_queue_t **cqm, int nch);
+
+/**
+ * @brief Tanh activation function for 8-bit fixed-point Multi-Channel convolution queue input
+ *
+ * @param cqm     Input 8-bit fixed-point Multi-Channel convolution queue
+ * @param offset  Offset used to calculate the beginning of input conv queue 
+ * @param nch     The channel number
+ */
+void dl_tanh_convq8_mc(dl_convq8_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief Fast and quantised 16-bit implementation of multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        Usually, this layer is used as first layer for 8-bit network.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. Input is a 16-bit queue array, output is an 8-bit queue array.
+ *
+ * @param in              Input 16bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_16in_mc_steps(dl_convq_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                        dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast and quantised 8-bit implementation of multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The function is declared void and returns no value.
+ *
+ * @param in              Input 8bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out,
+                                int nch, int rate, int size,
+                                dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                                int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implementation of an 8-bit dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The function is declared void and returns no value.
+ *
+ * @param in              Input 8-bit fixed-point convolution queue array
+ * @param out             Output 8-bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_dilation_layerq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                    dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                    dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                    int offset, int prenum);    
+
+void dl_convq8_queue_mc_bzero(dl_convq8_queue_t **cqm, int nch);
+
+
+
+dl_convq8_queue_t *dl_convq8_queue_alloc_from_psram(int n, int c);
+
+qtp_t *dl_dilation_layerq16_8(dl_convq_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_dilation_layerq8(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq8_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+dl_matrix2dq8_t *dl_convq8_lstm_layer(const dl_convq8_queue_t *in, dl_convq8_queue_t *out, dl_matrix2dq8_t *state_c,
+                                      dl_matrix2dq8_t *state_h, const dl_matrix2dq8_t *in_weight, const dl_matrix2dq8_t *h_weight,
+                                      const dl_matrix2dq_t *bias, int prenum);
+
+qtp_t *dl_atrous_conv1dq8_16_s3(dl_convq8_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq8_t* kernel, dl_matrix2dq_t* bias, int prenum);
+
+void print_convq8(dl_convq8_queue_t *cq, int offset);
+void print_convq(dl_convq_queue_t *cq, int offset);
+void dl_relu_convq8(dl_convq8_queue_t *cq);
+
+void lstmq8_free(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_convq_queue.h b/include/esp32c6/dl_lib_convq_queue.h
new file mode 100644
index 0000000..ff190fe
--- /dev/null
+++ b/include/esp32c6/dl_lib_convq_queue.h
@@ -0,0 +1,382 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ_QUEUE_H
+#define DL_LIB_CONVQ_QUEUE_H
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Fixed-point convolution FIFO queue.
+// [nch, n, c]
+typedef struct {
+    int n;           /**< The length of queue */
+    int c;           /**< The number of elements in the queue */
+    int front;       /**< The front (top) position of queue */
+    int nch;         /**< The channel of queue */
+    int exponent;    /**< The values in items should be multiplied by pow(2,exponent)
+                          to get the real values */
+    qtp_t *itemq;    /**< Pointer to item array */
+} dl_convq_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+
+void dl_convq_to_matrix2dq(dl_convq_queue_t *cq, dl_matrix2dq_t* out, int row);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq_queue_free(dl_convq_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cq     The fixed-point convolution queue point
+ */
+void dl_convq_queue_bzero(dl_convq_queue_t *cq);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element
+ */
+qtp_t *dl_convq_queue_pop(dl_convq_queue_t *cq);
+qtp_t *dl_convq_queue_popn(dl_convq_queue_t *cq, int n); /* NOTE(review): undocumented; presumably the n-element variant of pop - confirm */
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ * @param cq     Input fixed-point convolution queue
+ * @param a      The new element
+ * @param shift  NOTE(review): undocumented in the original header - confirm its meaning against the implementation
+ */
+void dl_convq_queue_push(dl_convq_queue_t *cq, dl_matrix2dq_t *a, int shift);
+
+/**
+ * @brief  Insert the floating-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation.
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The floating-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+void dl_convq16_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+dl_conv_queue_t *dl_queue_from_convq(dl_convq_queue_t *cq1);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param last_num  Offset from the front of the queue
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int last_num);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param offset    Offset from the front of the queue
+ * @param ch        Channel index of convolution queue 
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq_mc(dl_convq_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief   Does a tanh operation on one element of the convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, and does a
+ *          tanh operation by this pointer.
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @note          The function is declared void; no pointer is returned.
+ */
+void dl_tanh_convq(dl_convq_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on one element of a multi-channel convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, and does a
+ *          tanh operation by this pointer.
+ *
+ * @param cqm     Input fixed-point multi-channel convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param nch     The channel number of cqm
+ * @note          The function is declared void; no pointer is returned.
+ */
+void dl_tanh_convq_mc(dl_convq_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief   Does a relu operation on one element of the convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, and does a
+ *          relu operation by this pointer. The function is declared void; no pointer is returned.
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param clip      NOTE(review): undocumented in the original header - presumably the relu clipping value; confirm
+ * @param last_num  Offset from the front of the queue (named 'offset' in the original comment - confirm)
+ */
+void dl_relu_convq(dl_convq_queue_t *cq, fptp_t clip, int last_num);
+
+/**
+ * @brief   Does a softmax operation on one element of the convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, input data
+ *          stay as it is. Results are saved into the *out* array.
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param out     Old array to re-use. Passing NULL will allocate a new matrix.
+ * @return        softmax results
+ */
+fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
+
+/**
+ * @brief Fast and quantised implementation of 1D atrous convolution (a.k.a. convolution with holes or dilated convolution) based on convolution queue.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param rate     A positive int, the stride with which we sample input value
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector
+ * @param prenum   NOTE(review): undocumented here; dl_atrous_conv1dq_mc_steps describes it as the preload size (0: do not use preload) - confirm
+ * @return         The result of atrous convolution
+ */
+qtp_t * dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int prenum);
+
+/**
+ * @brief Fast implementation of a dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The return pointer is last element of output queue and should not be freed separately.
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          NOTE(review): undocumented here; dl_dilation_layerq_mc_steps describes it as the offset to calculate input convq - confirm
+ * @param prenum          NOTE(review): undocumented here; dl_dilation_layerq_mc_steps describes it as the preload size (0: do not use preload) - confirm
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+   dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+   dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+   int filter_shift, int gate_shift, int offset, int prenum);
+
+
+qtp_t *dl_dilation_layerq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                          dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+                          int filter_shift, int gate_shift, int prenum);
+
+qtp_t *dl_dilation_layerq16(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                             dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                             dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset, int prenum);
+
+/**
+ * @brief Add a pair of fixed-point convolution queues item-by-item, and return a floating-point convolution queue
+ *
+ * @param cq1      First fixed-point convolution queue
+ * @param cq2      Second fixed-point convolution queue
+ * @return         The resulting floating-point convolution queue
+ */
+dl_conv_queue_t *dl_convq_queue_add(dl_convq_queue_t *cq1, dl_convq_queue_t *cq2);
+
+/**
+ * @brief Fast implementation of LSTM layer by dl_atrous_conv1dq function
+ *
+ * @warning LSTM kernel is split into two parts: the first part's input is the last layer output and its kernel is *in_weight*;
+ *          the second part's input is the last frame LSTM output and its kernel is *h_weight*.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param state_c         Internal state of the LSTM network
+ * @param state_h         Internal state (previous output values) of the LSTM network
+ * @param in_weight       the LSTM kernel needed by first part
+ * @param h_weight        the LSTM kernel needed by second part
+ * @param bias            The bias matrix of LSTM. Can be NULL if a bias of 0 is required.
+ * @param in_shift        Shift ratio used in first part
+ * @param h_shift         Shift ratio used in second part
+ * @param prenum          NOTE(review): undocumented in the original header - presumably the preload size as in the other *_steps functions; confirm
+ * @return                The result of LSTM layer
+ */
+dl_matrix2dq_t *dl_convq_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                    dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight,
+                                    const dl_matrix2dq_t *bias, int in_shift, int h_shift, int prenum);
+dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                                       const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
+
+dl_matrix2dq_t *dl_convq16_lstm_layer(dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                       dl_matrix2dq_t *state_h, dl_matrix2dq_t *in_weight, dl_matrix2dq_t *h_weight,
+                                       dl_matrix2dq_t *bias, int prenum);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @param nch   The channel number of convolution queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t **dl_convq_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point multi channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number of cqm
+ */
+void dl_convq_queue_mc_free(dl_convq_queue_t **cqm, int nch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param nch      The channel number of input 
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
+ * @param offset   the offset to calculate input convq
+ * @param prenum   the preload size, 0: do not use preload function
+ * @return         The result of atrous convolution
+ */
+qtp_t *dl_atrous_conv1dq_mc_steps(  dl_convq_queue_t **in,
+                                    dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* kernel,
+									dl_matrix2dq_t* bias,
+									int shift,
+									int offset,
+									int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows for multi channel input
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param nch             The channel number of input 
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          The offset to calculate input convq
+ * @param prenum          The preload size, 0: do not use preload function
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_mc_steps( dl_convq_queue_t **in, 
+									dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* filter_kernel,
+									dl_matrix2dq_t* filter_bias,
+                                    dl_matrix2dq_t* gate_kernel,
+									dl_matrix2dq_t* gate_bias,
+                                    int filter_shift,
+									int gate_shift,
+									int offset,
+									int prenum);
+
+void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
+void test_lstm_convq(int size, int in_dim, int lstm_cell);
+void dl_nn_tanh_i162(dl_convq_queue_t **cqm, int offset, int nch);
+void dl_copy_queue_item_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit, int offset, int ch);
+void dl_convq_queue_mc_bzero(dl_convq_queue_t **cqm, int nch);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_matrix.h b/include/esp32c6/dl_lib_matrix.h
new file mode 100644
index 0000000..59f7d79
--- /dev/null
+++ b/include/esp32c6/dl_lib_matrix.h
@@ -0,0 +1,257 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIX_H
+#define DL_LIB_MATRIX_H
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+#if CONFIG_BT_SHARE_MEM_REUSE
+extern multi_heap_handle_t gst_heap;
+#endif
+
+//Flags for matrices
+#define DL_MF_FOREIGNDATA 1  /*< Matrix pointer and item data actually points to another matrix and should not be freed */
+#define DL_MF_FOREIGNITEM 2  /*< Only item data actually points to another matrix and should not be freed */
+
+//'Normal' float matrix
+typedef struct {
+    int w;          /*< Width */
+    int h;          /*< Height */
+    int stride;     /*< Row stride, essentially how many items to skip to get to the same position in the next row */
+    int flags;      /*< Flags. OR of DL_MF_* values */
+    fptp_t *item;   /*< Pointer to item array */
+} dl_matrix2d_t;
+
+//Macro to quickly access the raw items in a matrix (row-major); m is parenthesized so pointer expressions expand safely
+#define DL_ITM(m, x, y) (m)->item[(x)+(y)*(m)->stride]
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_alloc(int w, int h);
+
+
+/**
+ * @brief Free a matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->items space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrix_free(dl_matrix2d_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrix_zero(dl_matrix2d_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ * @param m     Matrix to copy
+ * @return The copy of the matrix (NOTE(review): presumably NULL if out of memory, as with dl_matrix_alloc - confirm)
+ */
+dl_matrix2d_t *dl_matrix_copy_to_psram(const dl_matrix2d_t *m);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ * @param src   Existing matrix to take the slice from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_slice(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @warning The results are flattened in row-major order.
+ * @param src   Existing matrix to select the items from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return  The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_flatten(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Generate a matrix from existing floating-point data
+ * @param w      Width of resulting matrix
+ * @param h      Height of resulting matrix
+ * @param stride Row stride of the resulting matrix (presumably in items, cf. dl_matrix2d_t.stride - TODO confirm)
+ * @param data   Data to populate matrix with
+ * @return A newly allocated matrix populated with the given input data, or NULL if out of memory.
+ */
+dl_matrix2d_t *dl_matrix_from_data(int w, int h, int stride, const void *data);
+
+
+/**
+ * @brief Multiply a pair of matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_mul(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two matrices : res=a.b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_dot(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Add a pair of matrices item-by-item: out=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Added data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_add(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Divide a pair of matrices item-by-item: out=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_div(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Subtract a matrix from another, item-by-item: out=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Subtracted data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_sub(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Add a constant to every item of the matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrix_add_const(dl_matrix2d_t *subj, const fptp_t add);
+
+
+/**
+ * @brief Concatenate the rows of two matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated matrix with as values a|b
+ */
+dl_matrix2d_t *dl_matrix_concat(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+dl_matrix2d_t *dl_matrix_concat_h( dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+/**
+ * @brief Print the contents of a matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrix(const dl_matrix2d_t *a);
+
+/**
+ * @brief Return the average square error given a correct and a test matrix.
+ *
+ * ...Well, more or less. If anything, it gives an indication of the error between
+ * the two. Check the code for the exact implementation.
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return value indicating the relative difference between matrices
+ */
+float dl_matrix_get_avg_sq_err(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+
+/**
+ * @brief Check if two matrices have the same shape, that is, the same amount of rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrix_same_shape(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+/**
+ * @brief Get a specific item from the matrix
+ *
+ * Please use these for external matrix access instead of DL_ITM
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position
+ */
+inline static fptp_t dl_matrix_get(const dl_matrix2d_t *m, const int x, const int y) { 
+    return DL_ITM(m, x, y);
+}
+
+/**
+ * @brief Set a specific item in the matrix to the given value
+ *
+ * Please use these for external matrix access instead of DL_ITM
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position
+ */
+inline static void dl_matrix_set(dl_matrix2d_t *m, const int x, const int y, fptp_t val) { 
+    DL_ITM(m, x, y)=val;
+}
+
+void matrix_get_range(const dl_matrix2d_t *m, fptp_t *rmin, fptp_t *rmax);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32c6/dl_lib_matrixq.h b/include/esp32c6/dl_lib_matrixq.h
new file mode 100644
index 0000000..8ad397b
--- /dev/null
+++ b/include/esp32c6/dl_lib_matrixq.h
@@ -0,0 +1,387 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ_H
+#define DL_LIB_MATRIXQ_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int16_t qtp_t;
+
+//Quantized matrix. Uses fixed numbers and has the storage for the rows/columns inverted 
+//for easy use as a multiplicand without stressing out the flash cache too much.
+typedef struct {
+    int w;
+    int h;
+    int stride; //Normally equals h, not w!
+    int flags;
+    int exponent; //The values in items should be multiplied by pow(2,exponent) to get the real values.
+    qtp_t *itemq;
+} dl_matrix2dq_t;
+
+#define DL_QTP_SHIFT 15
+#define DL_QTP_RANGE ((1<<DL_QTP_SHIFT)-1)
+#define DL_ITMQ(m, x, y) (m)->itemq[(y)+(x)*(m)->stride] //x-major storage (inverted vs. DL_ITM); m parenthesized for safe expansion
+#define DL_QTP_EXP_NA 255 //non-applicable exponent because matrix is null
+
+#define DL_SHIFT_AUTO 32
+
+/**
+ * @info About quantized matrices and shift values
+ *
+ * Grab a coffee (or tea, or hot water)  and sit down when you read this for the first 
+ * time. Quantized matrices can speed up your operations, but come with some quirks, and
+ * it's good to understand how they work before using them.
+ *
+ * The data in the quantized matrix type is stored similarly to floating-point types:
+ * when storing a real value, the value is stored as a mantissa (base number) and an
+ * exponent. The 'real' value that can be re-derived from those two numbers is something
+ * similar to mantissa*2^exponent. Up to this point, there's not that much difference from 
+ * the standard floating point implementations like e.g. IEEE-754.
+ *
+ * The difference with respect to quantized matrices is that for a quantized matrix, it is 
+ * assumed all values stored have more-or-less the same order of magnitude. This allows the
+ * matrix to only store all the mantissas, while the exponents are shared; there is only one 
+ * exponent for the entire matrix. This makes it quicker to handle matrix operations - the
+ * logic to fix the exponents only needs to happen once, while the rest can be done in simple
+ * integer arithmetic. It also nets us some memory savings - while normally a floating point
+ * number is 32-bit, storing only 16-bit mantissas as the matrix items almost halves the 
+ * memory requirements.
+ *
+ * While most of the details of handling the intricacies of the quantized matrices are done
+ * transparently by the code in dl_lib_matrixq.c, some implementation details leak out, 
+ * specifically in places where addition/subtraction/division happens.
+ *
+ * The problem is that the routines do not know what the size of the resulting operation is. For
+ * instance, when adding two matrices of numbers, the resulting numbers *could* be large enough
+ * to overflow the mantissa of the result if the exponent is the same. However, if by default we
+ * assume the mantissas need to be scaled back, we may lose precision.
+ *
+ * In order to counter this, all operations that have this issue have a ``shift`` argument. If 
+ * the argument is zero, the routine will be conservative, that is, increase the exponent of 
+ * the result to such an extent it's mathematically impossible a value in the result will exceed
+ * the maximum value that can be stored. However, when this argument is larger than zero, the
+ * algorithm will hold back on this scaling by the indicated amount of bits, preserving precision
+ * but increasing the chance of some of the calculated values not fitting in the mantissa anymore.
+ * If this happens, the value will be clipped to the largest (or, for negative values, smallest)
+ * value possible. (Neural networks usually are okay with this happening for a limited amount
+ * of matrix indices).
+ *
+ * For deciding on these shift values, it is recommended to start with a shift value of one, then
+ * use dl_matrixq_check_sanity on the result. If this indicates clipping, lower the shift value. 
+ * If it indicates bits are under-used, increase it. Note that for adding and subtraction, only
+ * shift values of 0 or 1 make sense; these routines will error out if you try to do something
+ * else.
+ *
+ * For neural networks and other noise-tolerant applications, note that even when 
+ * dl_matrixq_check_sanity does not indicate any problems, twiddling with the shift value may lead
+ * to slightly improved precision. Feel free to experiment.
+ **/
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
+dl_matrix2dq_t *dl_matrixq_alloc_psram(int w, int h);
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq_t *out);
+
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix; m_bit/f_bit presumably select a Qm.f fixed-point format (TODO: confirm and describe)
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d_by_qmf(const dl_matrix2d_t *m, dl_matrix2dq_t *out, int m_bit, int f_bit);
+
+
+/**
+ * @brief Convert a quantized matrix to a floating-point one.
+ *
+ * @param m     Quantized matrix to convert
+ * @param out   Floating-point matrix to re-use. If NULL, allocate a new one.
+ * @return The floating-point version of the quantized matrix
+ **/
+dl_matrix2d_t *dl_matrix2d_from_matrixq(const dl_matrix2dq_t *m, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->items space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq_free(dl_matrix2dq_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrixq_zero(dl_matrix2dq_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq_t *dl_matrixq_copy_to_psram(const dl_matrix2dq_t *m);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b, Result is a fixed-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices: res=a.b, Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product.
+ *
+ * Result is a fixed-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot calls; this function can be
+ * much slower than dl_matrixq_dot .
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product. 
+ *
+ * Result is a floating-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot_matrix_out calls; this function can be
+ * much slower than dl_matrixq_dot_matrix_out.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of a floating point and a quantized matrix. Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand; float matrix
+ * @param b     Second multiplicand; quantized matrix
+ * @param res   Dotproduct data; float matrix. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_matrixq_dot(const dl_matrix2d_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+
+/**
+ * @brief Print the contents of a quantized matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrixq(const dl_matrix2dq_t *a);
+
+
+/**
+ * @brief Add a pair of quantized matrices item-by-item: res=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Added data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_add(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @warning In contrast to the floating point equivalent of this function, the fixed-point version
+ * of this has the issue that as soon as the output exponent of one of the slices changes, the data
+ * in the sliced matrix gets corrupted (because the exponent of that matrix is still the same.) If you
+ * use this function, either treat the slices as read-only, or assume the sliced matrix contains
+ * garbage after modifying the data in one of the slices.
+ * @param src   Existing quantized matrix to take the slice from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_slice(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @warning The results are flattened in row-major order.
+ * @param src   Existing quantized matrix to select the items from
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_flatten(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Subtract a quantized matrix from another, item-by-item: res=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Subtracted data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_sub(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Multiply a pair of quantized matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that matrix.
+ */
+void dl_matrixq_mul( dl_matrix2dq_t *a,  dl_matrix2dq_t *b, dl_matrix2dq_t *res);
+
+/**
+ * @brief Divide a pair of quantized matrices item-by-item: out=a/b
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value; see the note on quantized matrices and shift values in this header
+ */
+void dl_matrixq_div(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Check if two quantized matrices have the same shape, that is, the same amount of 
+ * rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrixq_same_shape(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Concatenate the rows of two quantized matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated quantized matrix with as values a|b
+ */
+dl_matrix2dq_t *dl_matrixq_concat(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Add a constant to every item of the quantized matrix
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ * @param shift Shift value; see the note on quantized matrices and shift values in this header
+ */
+void dl_matrixq_add_const(dl_matrix2dq_t *subj, const fptp_t add, int shift);
+
+/**
+ * @brief Check the sanity of a quantized matrix
+ *
+ * Due to the nature of quantized matrices, depending on the calculations a quantized
+ * matrix is the result of and the shift values chosen in those calculations, a quantized
+ * matrix may have an exponent and mantissas that lead to a loss of precision, either because
+ * most significant mantissa bits are unused, or because a fair amount of mantissas are 
+ * clipped. This function checks if this is the case and will report a message to stdout
+ * if significant loss of precision is detected.
+ *
+ * @param m     The quantized matrix to check
+ * @param name  A string to be displayed in the message if the sanity check fails
+ * @return True if matrix is sane, false otherwise
+ **/
+
+int dl_matrixq_check_sanity(dl_matrix2dq_t *m, const char *name);
+
+/**
+ * @brief re-adjust the exponent of the matrix to fit the mantissa better
+ *
+ * This function will shift up all the data in the mantissas so there are no
+ * most-significant bits that are unused in all mantissas. It will also adjust
+ * the exponent to keep the actual values in the matrix the same.
+ *
+ * Some operations done on a matrix, especially operations that re-use the
+ * result of earlier operations done in the same way, can lead to the loss of
+ * data because the exponent of the quantized matrix is never re-adjusted. You
+ * can force that re-adjustment by calling this function.
+ *
+ * @param m     The matrix to re-adjust
+**/
+void dl_matrixq_readjust_exp(dl_matrix2dq_t *m);
+
+
+
+/**
+ * @brief Get the floating-point value of a specific item from the quantized matrix
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position
+ */
+fptp_t dl_matrixq_get(const dl_matrix2dq_t *m, const int x, const int y);
+
+/**
+ * @brief Set a specific item in the quantized matrix to the given 
+ * floating-point value
+ *
+ * @warning If the given value is more than the exponent in the quantized matrix
+ * allows for, all mantissas in the matrix will be shifted down to make the value
+ * 'fit'. If, however, the exponent is such that the value would result in a
+ * quantized mantissa of 0, nothing is done.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position
+ */
+void dl_matrixq_set(dl_matrix2dq_t *m, const int x, const int y, fptp_t val);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32c6/dl_lib_matrixq8.h b/include/esp32c6/dl_lib_matrixq8.h
new file mode 100644
index 0000000..377df7c
--- /dev/null
+++ b/include/esp32c6/dl_lib_matrixq8.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ8_H
+#define DL_LIB_MATRIXQ8_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+#include "dl_lib.h"
+#include "dl_lib_matrixq.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int8_t q8tp_t;
+
+typedef struct {
+    int w;
+    int h;
+    int stride; //Normally equals h, not w!
+    int flags;
+    int exponent; //The values in items should be multiplied by pow(2,exponent) to get the real values.
+    q8tp_t *itemq;
+} dl_matrix2dq8_t;
+
+#define DL_Q8TP_SHIFT 7
+#define DL_Q8TP_RANGE ((1<<DL_Q8TP_SHIFT)-1)
+#define DL_ITMQ8(m, x, y) (m)->itemq[(y)+(x)*(m)->stride] //x-major storage, same layout as DL_ITMQ; m parenthesized for safe expansion
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq8_t *dl_matrixq8_alloc(int w, int h);
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->items space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq8_free(dl_matrix2dq8_t *m);
+
+/**
+ * @brief Copy a quantized matrix
+ * Copy a quantized matrix from flash or iram/psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq8_t *dl_matrixq8_copy_to_psram(const dl_matrix2dq8_t *m);
+
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq8_t *dl_matrixq8_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq8_t *out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32c6/esp_aec.h b/include/esp32c6/esp_aec.h
new file mode 100644
index 0000000..36de9c1
--- /dev/null
+++ b/include/esp32c6/esp_aec.h
@@ -0,0 +1,105 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AEC_H_
+#define _ESP_AEC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_AEC_FFT                      // Not kiss_fft
+#define AEC_SAMPLE_RATE     16000        // Only Support 16000Hz
+#define AEC_FRAME_LENGTH_MS 32
+
+typedef struct aec_handle_t aec_handle_t;
+typedef enum {
+    AEC_MODE_SR_LOW_COST = 0,     // Low Cost AEC for speech recognition
+    AEC_MODE_SR_HIGH_PERF = 1,    // High Performance AEC for speech recognition
+    AEC_MODE_VOIP_LOW_COST = 3,   // Low Cost AEC for voice communication
+    AEC_MODE_VOIP_HIGH_PERF = 4,  // High Performance AEC for voice communication
+} aec_mode_t;
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ * Please get frame size by aec_get_chunksize() function
+ * 
+ * @param sample_rate       The Sampling frequency (Hz) must be 16000.
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Creates an instance to the AEC structure, same with aec_create().
+ * 
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic.
+ *
+ * @warning The indata, refdata and outdata must be 16-bit signed. Please allocate memory by heap_caps_aligned_alloc().
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      An array of 16-bit signed audio samples from mic. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
+ * @param refdata     An array of 16-bit signed audio samples sent to the speaker.
+ * @param outdata     Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
+ * @return None
+ *
+ */
+void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int aec_get_chunksize(const aec_handle_t *handle);
+
+/**
+ * @brief Get AEC mode string 
+ * 
+ * @param aec_mode  The mode of AEC.
+ * 
+ * @return AEC mode string
+ */
+char * aec_get_mode_string(aec_mode_t aec_mode);
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void aec_destroy(aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AEC_H_
diff --git a/include/esp32c6/esp_afe_aec.h b/include/esp32c6/esp_afe_aec.h
new file mode 100644
index 0000000..9d60588
--- /dev/null
+++ b/include/esp32c6/esp_afe_aec.h
@@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only support 1 microphone channel and 1 playback channel.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AFE AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
diff --git a/include/esp32c6/esp_afe_config.h b/include/esp32c6/esp_afe_config.h
new file mode 100644
index 0000000..f9de6fe
--- /dev/null
+++ b/include/esp32c6/esp_afe_config.h
@@ -0,0 +1,69 @@
+#pragma once
+#include "esp_aec.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// VC:  Voice Communication
+
+// Set AFE_SR mode
+typedef enum {
+    SR_MODE_LOW_COST = 0,  // Deprecated, please use afe_mode_t, AFE mode: low cost mode
+    SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
+} afe_sr_mode_t;
+
+// Set AFE mode
+typedef enum {
+    AFE_MODE_LOW_COST = 0,  // AFE mode: low cost mode
+    AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
+} afe_mode_t;
+
+// Set AFE type
+typedef enum {
+    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+} afe_type_t;
+
+typedef enum {
+    AFE_MEMORY_ALLOC_MORE_INTERNAL = 1,          // malloc with more internal ram
+    AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
+    AFE_MEMORY_ALLOC_MORE_PSRAM = 3              // malloc with more psram
+} afe_memory_alloc_mode_t;
+
+typedef enum {
+    AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
+    AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
+    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
+    AFE_MN_PEAK_NO_AGC = 0,      // There is no agc gain
+} afe_mn_peak_agc_mode_t;
+
+typedef struct {
+    int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
+    int mic_num;      // microphone channel number
+    uint8_t *mic_ids; // microphone channel indices
+    int ref_num;      // playback reference channel number
+    uint8_t *ref_ids; // playback reference channel indices
+    int sample_rate;  // sample rate of audio
+} afe_pcm_config_t;
+
+typedef enum {
+    AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
+    AFE_NS_MODE_NET = 1,    // please use model name of NSNET
+} afe_ns_mode_t;
+
+typedef enum {
+    AFE_AGC_MODE_WEBRTC = 0,  // WEBRTC AGC
+    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
+} afe_agc_mode_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/include/esp32c6/esp_mfcc_fbank_int16.h b/include/esp32c6/esp_mfcc_fbank_int16.h
new file mode 100644
index 0000000..22a5f2c
--- /dev/null
+++ b/include/esp32c6/esp_mfcc_fbank_int16.h
@@ -0,0 +1,86 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t;
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data (presumably NULL on error -- TODO confirm against the implementation).
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration on frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq*winstep_ms/1000.
+ * @param nch Presumably the number of input channels (cf. nch in esp_mfcc_opts_t) -- TODO confirm
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function when done
+ *         with this buffer. Note that some implementations require the buffer to be freed before another call to this function.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+/**
+ * @brief Clean all state of mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;
+    esp_mfcc_op_create_t create;
+    esp_mfcc_op_run_step_t run_step;
+    esp_mfcc_op_clean_t clean;
+} esp_mfcc_iface_t;
diff --git a/include/esp32c6/esp_mfcc_iface.h b/include/esp32c6/esp_mfcc_iface.h
new file mode 100644
index 0000000..0257768
--- /dev/null
+++ b/include/esp32c6/esp_mfcc_iface.h
@@ -0,0 +1,89 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t;
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data (presumably NULL on error -- TODO confirm against the implementation).
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration on frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq*winstep_ms/1000.
+ * @param nch Presumably the number of input channels (cf. nch in esp_mfcc_opts_t) -- TODO confirm
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function when done
+ *         with this buffer. Note that some implementations require the buffer to be freed before another call to this function.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+typedef void (*esp_mfcc_op_run_step_s16_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t *fbank);
+
+/**
+ * @brief Clean all state of mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;
+    esp_mfcc_op_create_t create;
+    esp_mfcc_op_run_step_t run_step;
+    esp_mfcc_op_run_step_s16_t run_step_s16;
+    esp_mfcc_op_clean_t clean;
+} esp_mfcc_iface_t;
diff --git a/include/esp32c6/esp_mfcc_models.h b/include/esp32c6/esp_mfcc_models.h
new file mode 100644
index 0000000..44086e8
--- /dev/null
+++ b/include/esp32c6/esp_mfcc_models.h
@@ -0,0 +1,44 @@
+#pragma once
+#include "esp_mfcc_iface.h"
+
+extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
+extern const esp_mfcc_iface_t esp_fbank_s16; // int16-fbank handle
+
+/**
+ * @brief Return basic opts used in wakenet9 & multinet5
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9();
+
+/**
+ * @brief Return basic opts used in wakenet9s
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+
+/**
+ * @brief Return basic opts for default kaldifeat
+ *
+    opts->psram_first = true;
+    opts->use_power = true;
+    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
+    opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
+    opts->win_type = "povey";
+    opts->low_freq = 20;
+    opts->high_freq = 7600;
+    opts->samp_freq = 16000;
+    opts->nch = 1;
+    opts->nfft = 512;
+    opts->nfilter = 80;
+    opts->numcep = 80;
+    opts->preemph = 0.97;
+    opts->append_energy = false;
+    opts->winlen_ms = 25;
+    opts->winstep_ms = 10;
+    opts->remove_dc_offset = true;
+ *
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_kaldi();
+
+/**
+ * @brief Print mfcc opts
+ **/
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
diff --git a/include/esp32c6/esp_speech_features.h b/include/esp32c6/esp_speech_features.h
new file mode 100644
index 0000000..c1659f9
--- /dev/null
+++ b/include/esp32c6/esp_speech_features.h
@@ -0,0 +1,62 @@
+#pragma once
+#include "c_speech_features_config.h"
+#include "stdlib.h"
+#include <assert.h>
+#include <stdbool.h>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+typedef struct {
+    float *coeff;
+    int *bank_pos;
+    int nfilter;
+} esp_mel_filter_t;
+
+float *esp_mfcc_malloc(size_t size, bool from_psram);
+
+void esp_mfcc_free(void *ptr);
+
+/**
+ * @brief Initialize FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
+ * @return fft-table
+ **/
+void *esp_fft_init(int nfft);
+
+/**
+ * @brief Free FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For Other platform, use kiss fft
+ *
+ * @param fft_table  The fft table initialized by esp_fft_init
+ * @param nfft       The input samples number
+ * @return None
+ **/
+void esp_fft_deinit(void *fft_table, int nfft);
+
+/**
+ * @brief Initial window function
+ *        Currently support hanning, hamming, sine, povey, rectangular,
+ *        wn9(512-hanning to get wakenet9& multinet5 compatible)
+ **/
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
+
+float *esp_fftr(float *x, int nfft, void *fft_table);
+
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+
+void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
+
+float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
+
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
+
+void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
+
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
diff --git a/include/esp32c6/esp_wn_iface.h b/include/esp32c6/esp_wn_iface.h
new file mode 100644
index 0000000..44bab8d
--- /dev/null
+++ b/include/esp32c6/esp_wn_iface.h
@@ -0,0 +1,215 @@
+#pragma once
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+/**
+ * @brief The state of wakeup
+ */
+typedef enum
+{
+    WAKENET_NO_DETECT = 0,               // wake word is not detected
+    WAKENET_CHANNEL_VERIFIED = -1,       // output channel is verified
+    WAKENET_DETECTED = 1                 // wake word is detected
+} wakenet_state_t;
+
+//Set wake words recognition operating mode
+//The probability of being wake words is increased with increasing mode, 
+//As a consequence also the false alarm rate goes up
+typedef enum {
+	DET_MODE_90 = 0,       // Normal
+	DET_MODE_95 = 1,       // Aggressive
+    DET_MODE_2CH_90 = 2,
+    DET_MODE_2CH_95 = 3,
+    DET_MODE_3CH_90 = 4,
+    DET_MODE_3CH_95 = 5,
+} det_mode_t;
+
+typedef struct {
+    int wake_word_num;     //The number of all wake words
+    char **wake_word_list; //The name list of wake words  
+} wake_word_info_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode and specified wake word coefficient
+ *
+ * @param model_name  The specified wake word model coefficient
+ * @param det_mode    The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const void *model_name, det_mode_t det_mode);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the channel number of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of channels at the same time. This function
+ * can be used to query that amount.
+ *
+ * @param model The model object to query
+ * @return The number of channels to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the start point of wake word when one wake word is detected.
+ *
+ * @warning This function should be called when the channel index is verified.
+ * The returned value is the number of samples from start point of wake word to detected point.
+ *
+ * @param model The model object to query
+ * @return The number of samples from start point to detected point (end point)
+ */
+typedef int (*esp_wn_iface_op_get_start_point_t)(model_iface_data_t *model);
+
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_wn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of wake words
+ *
+ * @param model The model object to query
+ * @returns the number of wake words
+ */
+typedef int (*esp_wn_iface_op_get_word_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the name of wake word by index
+ *
+ * @warning The index of wake word start with 1
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns the name of the wake word
+ */
+typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param word_index The index of wake word
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
+
+/**
+ * @brief Get the wake word detection threshold of different modes
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns the detection threshold
+ */
+typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Feed samples of an audio stream to the keyword detection model and detect if there is a keyword found.
+ *
+ * @warning The index of wake word start with 1, 0 means no wake words is detected.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
+ *        get_samp_chunksize function.
+ * @return The index of wake words, return 0 if no wake word is detected, else the index of the wake words.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Get the volume gain
+ *
+ * @param model The model object to query
+ * @param target_db  The target dB to calculate volume gain
+ * @returns the volume gain
+ */
+typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a speech recognition model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param cq An array of 16-bit MFCC.
+ * @return The index of wake words, return 0 if no wake word is detected, else
+ * the index of the wake words.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a wake word detection model.
+ */
+typedef struct {
+    esp_wn_iface_op_create_t create;
+    esp_wn_iface_op_get_start_point_t get_start_point;
+    esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_wn_iface_op_get_channel_num_t get_channel_num;
+    esp_wn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_wn_iface_op_get_word_num_t get_word_num;
+    esp_wn_iface_op_get_word_name_t get_word_name;
+    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
+    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
+    esp_wn_iface_op_detect_t detect;
+    esp_wn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_wn_iface_op_clean_t clean;
+    esp_wn_iface_op_destroy_t destroy;
+} esp_wn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
\ No newline at end of file
diff --git a/include/esp32c6/esp_wn_models.h b/include/esp32c6/esp_wn_models.h
new file mode 100644
index 0000000..3a4d7e4
--- /dev/null
+++ b/include/esp32c6/esp_wn_models.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "esp_wn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of wakenet model name is used to filter all wakenet from available models.
+#define ESP_WN_PREFIX "wn"
+
+/**
+ * @brief Get the wakenet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of wakenet
+ */
+const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
+
+/**
+ * @brief Get the wake word name from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
+ */
+char* esp_wn_wakeword_from_name(const char *model_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+
+static const esp_wn_iface_t *model = esp_wn_handle_from_name(model_name);
+
+//Initialize wakeNet model data
+static model_iface_data_t *model_data=model->create(model_name, DET_MODE_90);
+
+//Set parameters of buffer
+int audio_chunksize=model->get_samp_chunksize(model_data);
+int frequency = model->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+int r=model->detect(model_data, buffer);
+if (r>0) {
+    printf("Detection triggered output %d.\n",  r);
+}
+
+//Destroy model
+model->destroy(model_data);
+
+*/
diff --git a/include/esp32s2/c_speech_features_config.h b/include/esp32s2/c_speech_features_config.h
new file mode 100644
index 0000000..e21e020
--- /dev/null
+++ b/include/esp32s2/c_speech_features_config.h
@@ -0,0 +1,29 @@
+#pragma once
+#include <float.h>
+#include <math.h>
+
+/* #undef ENABLE_DOUBLE */
+
+#ifdef ENABLE_DOUBLE
+# define csf_float double
+# define csf_ceil ceil
+# define csf_floor floor
+# define csf_sin sin
+# define csf_log log
+# define csf_log10 log10
+# define csf_pow pow
+# define csf_sqrt sqrt
+# define csf_abs fabs
+# define csf_float_min DBL_MIN
+#else
+# define csf_float float
+# define csf_ceil ceilf
+# define csf_floor floorf
+# define csf_sin sinf
+# define csf_log logf
+# define csf_log10 log10f
+# define csf_pow powf
+# define csf_sqrt sqrtf
+# define csf_abs fabsf
+# define csf_float_min FLT_MIN
+#endif
diff --git a/include/esp32s2/dl_lib.h b/include/esp32s2/dl_lib.h
new file mode 100644
index 0000000..47e7c86
--- /dev/null
+++ b/include/esp32s2/dl_lib.h
@@ -0,0 +1,418 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_H
+#define DL_LIB_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#include "esp_heap_caps.h"
+#include "sdkconfig.h"
+#define DL_SPIRAM_SUPPORT 1
+#endif
+
+#ifdef CONFIG_IDF_TARGET_ESP32S3
+#include "esp32s3/rom/cache.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int padding_state;
+
+// /**
+//  * @brief Allocate a chunk of memory which has the given capabilities.
+//  *        Equivalent semantics to libc malloc(), for capability-aware memory.
+//  *        In IDF, malloc(p) is equivalent to heap_caps_malloc(p, MALLOC_CAP_8BIT).
+//  * 
+//  * @param size  In bytes, of the amount of memory to allocate
+//  * @param caps  Bitwise OR of MALLOC_CAP_* flags indicating the type of memory to be returned
+//  *              MALLOC_CAP_SPIRAM:   Memory must be in SPI RAM
+//  *              MALLOC_CAP_INTERNAL: Memory must be internal; specifically it should not disappear when flash/spiram cache is switched off
+//  *              MALLOC_CAP_DMA:      Memory must be able to accessed by DMA
+//  *              MALLOC_CAP_DEFAULT:  Memory can be returned in a non-capability-specific memory allocation
+//  * @return Pointer to currently allocated heap memory
+//  **/
+// void *heap_caps_malloc(size_t size, uint32_t caps);
+
+/**
+ * @brief Allocate aligned memory from internal memory or external memory.
+ *        if cnt*size > CONFIG_SPIRAM_MALLOC_ALWAYSINTERNAL, allocate memory from internal RAM
+ *        else, allocate memory from PSRAM
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently allocated heap memory
+ */
+void *dl_lib_calloc(int cnt, int size, int align);
+
+/**
+ * @brief Always allocate aligned memory from external memory.
+ *
+ * @param cnt    Number of continuing chunks of memory to allocate
+ * @param size   Size, in bytes, of a chunk of memory to allocate     
+ * @param align  Aligned size, in bits
+ * @return Pointer to currently aligned heap memory
+ */
+void *dl_lib_calloc_psram(int cnt, int size, int align);
+
+/**
+ * @brief Free aligned memory allocated by `dl_lib_calloc` or `dl_lib_calloc_psram` 
+ * 
+ * @param ptr    Pointer to free
+ */
+void dl_lib_free(void *ptr);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * As described in https://codingforspeed.com/using-faster-exponential-approximation/
+ * Should be good until an input of 5 or so with a steps factor of 8.
+ *
+ * @param x Floating point input
+ * @param steps Approximation steps. More is more precise. 8 or 10 should be good enough for most purposes.
+ * @return Exp()'ed output
+ */
+fptp_t fast_exp(double x, int steps);
+
+/**
+ * @brief Does a fast version of the exp() operation on a floating point number.
+ *
+ * @param x Floating point input
+ * @return Exp()'ed output
+ */
+double fast_exp_pro(double x);
+
+/**
+ * @brief Does a softmax operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a softmax operation on a quantized matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_softmax_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a floating point number
+ *
+ * @param in Floating point input
+ * @return Sigmoid output
+ */
+
+fptp_t dl_sigmoid_op(fptp_t in);
+
+
+/**
+ * @brief Does a sigmoid operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+/**
+ * @brief Does a tanh operation on a floating point number
+ *
+ * @param v         Floating point input number
+ * @return Tanh value
+ */
+fptp_t dl_tanh_op(fptp_t v);
+
+/**
+ * @brief Does a tanh operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh(const dl_matrix2d_t *in, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a floating point number
+ *
+ * @param in        Floating point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+fptp_t dl_relu_op(fptp_t in, fptp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix.
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Fully connected layer operation
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Biases for the neurons. Can be NULL if a bias of 0 is required.
+ * @param out       Output array. Outputs are placed here. Needs to be an initialized, weight->w by in->h in size, matrix.
+ */
+void dl_fully_connect_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, dl_matrix2d_t *out);
+
+/**
+ * @brief Pre-calculate the sqrtvari variable for the batch_normalize function.
+ * The sqrtvari matrix depends on the variance and epsilon values, which normally are constant. Hence,
+ * this matrix only needs to be calculated once. This function does that.
+ * @param variance  Variance matrix
+ * @param epsilon   Epsilon value
+ * @param out       Matrix that presumably receives the sqrtvari result (not verifiable from this header)
+ */
+void dl_batch_normalize_get_sqrtvar(const dl_matrix2d_t *variance, fptp_t epsilon, dl_matrix2d_t *out);
+
+/**
+ * @brief Batch-normalize a matrix
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @return
+ */
+void dl_batch_normalize(dl_matrix2d_t *m, const dl_matrix2d_t *offset, const dl_matrix2d_t *scale, 
+                        const dl_matrix2d_t *mean, const dl_matrix2d_t *sqrtvari);
+
+/**
+ * @brief Do a basic LSTM layer pass.
+ *
+ * @warning Returns state_h pointer, so do not free result.
+ *
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @return          Output values of the neurons
+ */
+dl_matrix2d_t *dl_basic_lstm_layer(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h, 
+                const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a basic LSTM layer pass, partial quantized version.
+ * This LSTM function accepts 16-bit fixed-point weights and 32-bit float-point bias.
+ *
+ * @warning Returns state_h pointer, so do not free result.
+ *          NOTE(review): declared return type is dl_matrix2dq_t* while state_h is dl_matrix2d_t* - confirm.
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons, need to be quantised
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @return          Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_quantised_weights(const dl_matrix2d_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+				const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias);
+
+/**
+ * @brief Do a fully-connected layer pass, fully-quantized version.
+ *
+ * @param in        Input vector
+ * @param weight    Weights of the neurons
+ * @param bias      Bias values of the neurons. Can be NULL if no bias is needed.
+ * @param out       Output matrix; the function returns void, so results are presumably written here
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ */
+void dl_fully_connect_layer_q(const dl_matrix2dq_t *in, const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Do a basic LSTM layer pass, fully-quantized version
+ *
+ * @warning Returns state_h pointer, so do not free result.
+ *
+ * @param in        Input vector
+ * @param state_c   Internal state of the LSTM network
+ * @param state_h   Internal state (previous output values) of the LSTM network
+ * @param weight    Weights for the neurons
+ * @param bias      Bias for the neurons. Can be NULL if no bias is required
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return          Output values of the neurons
+ */
+dl_matrix2dq_t *dl_basic_lstm_layer_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int shift);
+
+/**
+ * @brief Batch-normalize a matrix, fully-quantized version
+ *
+ * @param m         The matrix to normalize
+ * @param offset    Offset matrix
+ * @param scale     Scale matrix
+ * @param mean      Mean matrix
+ * @param sqrtvari  Matrix precalculated using dl_batch_normalize_get_sqrtvar
+ * @param shift     Number of bits to shift the result back by. See dl_lib_matrixq.h for more info
+ * @return
+ */
+void dl_batch_normalize_q(dl_matrix2dq_t *m, const dl_matrix2dq_t *offset, const dl_matrix2dq_t *scale, 
+                        const dl_matrix2dq_t *mean, const dl_matrix2dq_t *sqrtvari, int shift);
+
+/**
+ * @brief Does a relu (Rectifier Linear Unit) operation on a fixed-point number
+ * This accepts and returns fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in        Fixed-point input
+ * @param clip      If value is higher than this, it will be clipped to this value
+ * @return Relu output
+ */
+qtp_t dl_relu_q_op(qtp_t in, qtp_t clip);
+
+/**
+ * @brief Does a ReLu operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_relu_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a sigmoid operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param in Fixed-point input
+ * @return Sigmoid output
+ */
+int dl_sigmoid_op_q(const int in);
+int16_t dl_sigmoid_op_q8(const int16_t in);
+/**
+ * @brief Does a sigmoid operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+/**
+ * @brief Does a tanh operation on a fixed-point number.
+ * This accepts and returns a fixed-point 32-bit number with the last 15 bits being the bits after the decimal
+ * point. (Equivalent to a mantissa in a quantized matrix with exponent -15.)
+ *
+ * @param v Fixed-point input
+ * @return tanh output
+ */
+int dl_tanh_op_q(int v);
+int16_t dl_tanh_op_q8(int16_t v);
+
+void load_mat_psram_mn4(void);
+void load_mat_psram_mn3(void);
+void free_mat_psram_mn4(void);
+void free_mat_psram_mn3(void);
+qtp_t dl_hard_sigmoid_op(qtp_t in, int exponent);
+qtp_t dl_hard_tanh_op(qtp_t in, int exponent);
+
+int16_t dl_table_tanh_op(int16_t in, int exponent);
+int16_t dl_table_sigmoid_op(int16_t in, int exponent);
+
+void dl_hard_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_hard_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+void dl_table_sigmoid_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+void dl_table_tanh_q(const dl_matrix2dq_t *in, dl_matrix2dq_t *out);
+
+
+/**
+ * @brief Filter out the number greater than clip in the matrix, float version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum(const dl_matrix2d_t *in, fptp_t clip, dl_matrix2d_t *out);
+
+/**
+ * @brief Filter out the number greater than clip in the matrix, quantized version
+ *
+ * @param in        Input matrix
+ * @param clip      If values are higher than this, they will be clipped to this value
+ * @param out       Output matrix. Can be the same as the input matrix; if so, output results overwrite the input.
+ */
+void dl_minimum_q(const dl_matrix2dq_t *in, fptp_t clip, dl_matrix2dq_t *out);
+/**
+ * @brief Do a basic CNN layer pass.
+ *
+ * @warning This just supports the single channel input image, and the output is a single row matrix.
+ *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height
+ * @param bias           Bias for the CNN layer.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1.
+ * @return               The result of CNN layer.
+ */
+dl_matrix2d_t *dl_basic_conv_layer(const dl_matrix2d_t *in, const dl_matrix2d_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                   const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+
+/**
+ * @brief Do a basic CNN layer pass, quantised version.
+ *
+ * @warning This just supports the single channel input image, and the output is a single row matrix.
+ *          That is to say, the height of output is 1, and the width of output is out_channels*out_image_width*out_image_height
+ *
+ * @param in             Input single channel image
+ * @param weight         Weights of the neurons, weight->w = out_channels, weight->h = filter_width*filter_height,
+ * @param bias           Bias of the neurons.
+ * @param filter_height  The height of convolution kernel
+ * @param filter_width   The width of convolution kernel
+ * @param out_channels   The number of output channels of convolution kernel
+ * @param stride_x       The step length of the convolution window in x(width) direction
+ * @param stride_y       The step length of the convolution window in y(height) direction
+ * @param pad            One of `"VALID"` or `"SAME"`, 0 is "VALID" and the other is "SAME"
+ * @param out            The result of CNN layer, out->h=1
+ * @return               The result of CNN layer
+ */
+dl_matrix2d_t *dl_basic_conv_layer_quantised_weight(const dl_matrix2d_t *in, const dl_matrix2dq_t *weight, const dl_matrix2d_t *bias, int filter_width, int filter_height, 
+                                                     const int out_channels, const int stride_x, const int stride_y,  padding_state pad, const dl_matrix2d_t* out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_coefgetter_if.h b/include/esp32s2/dl_lib_coefgetter_if.h
new file mode 100644
index 0000000..a21de8d
--- /dev/null
+++ b/include/esp32s2/dl_lib_coefgetter_if.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_COEFGETTER_IF_H
+#define DL_LIB_COEFGETTER_IF_H
+
+#include "dl_lib_matrix.h"
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "cJSON.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Set this if the coefficient requested is a batch-normalization popvar matrix which needs to be preprocessed by
+//dl_batch_normalize_get_sqrtvar first.
+#define COEF_GETTER_HINT_BNVAR (1<<0)
+
+/*
+This struct describes the basic information of model data:
+word_num: the number of wake words or speech commands
+word_list: the name list of wake words or speech commands
+win_list: NOTE(review): not described in the original comment; presumably one entry per word - confirm
+thresh_list: the threshold list of wake words or speech commands
+info_str: string reflecting the version and information of model data: network architecture, model version, wake words and thresholds
+*/
+typedef struct {
+    int word_num;
+    char **word_list;
+    int *win_list;
+    float *thresh_list;
+    char *info_str;
+} model_info_t;
+
+/*
+Alphabet struct describes the basic graphemes or phonemes.
+item_num: the number of basic items (graphemes or phonemes)
+items: the list of basic items
+*/
+typedef struct {
+    int item_num;
+    char **items;
+}alphabet_t;
+
+/*
+This struct describes a generic coefficient getter: a way to get the constant coefficients needed for a neural network.
+For the matrix getters (getter_f/getter_q/getter_q8), the name describes the name of the coefficient matrix, usually the same as the Numpy filename the
+coefficient was originally stored in. The arg argument can be used to optionally pass an additional user-defined argument
+to the getter (e.g. the directory to look for files in the case of the Numpy file loader getter). The hint argument
+is a bitwise OR of the COEF_GETTER_HINT_* flags or 0 when none is needed. Use the free_f/free_q/free_q8 functions to release the
+memory for the returned matrices, when applicable.
+*/
+typedef struct {
+    const dl_matrix2d_t* (*getter_f)(const char *name, void *arg, int hint);
+    const dl_matrix2dq_t* (*getter_q)(const char *name, void *arg, int hint);
+    const dl_matrix2dq8_t* (*getter_q8)(const char *name, void *arg, int hint);
+    void (*free_f)(const dl_matrix2d_t *m);
+    void (*free_q)(const dl_matrix2dq_t *m);
+    void (*free_q8)(const dl_matrix2dq8_t *m);
+    const model_info_t* (*getter_info)(void *arg);
+    const alphabet_t* (*getter_alphabet)(void *arg);
+    const cJSON* (*getter_config)(void *arg);
+} model_coeff_getter_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_conv_queue.h b/include/esp32s2/dl_lib_conv_queue.h
new file mode 100644
index 0000000..7cb9bf9
--- /dev/null
+++ b/include/esp32s2/dl_lib_conv_queue.h
@@ -0,0 +1,180 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONV_QUEUE_H
+#define DL_LIB_CONV_QUEUE_H
+
+
+#include "dl_lib_matrix.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+//Flags for matrices
+// #define DL_MF_FOREIGNDATA (0)  /*< Matrix *item data actually points to another matrix and should not be freed */
+
+//Float convolution FIFO queue.
+typedef struct {
+    int n;          /**< the length of queue */
+    int c;          /**< the channel number of queue element */
+    int front;      /**< the front(top) position of queue */
+    int flag;       /**< not used */
+    fptp_t *item;   /**< Pointer to item array */
+} dl_conv_queue_t;
+
+/**
+ * @brief Allocate a convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a convolution queue from psram
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_conv_queue_t *dl_conv_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Free a convolution queue
+ *
+ * @param cq     The convolution queue to free
+ */
+void dl_conv_queue_free(dl_conv_queue_t *cq);
+
+void dl_conv_to_matrix2d(dl_conv_queue_t *cq, dl_matrix2d_t* out);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input convolution queue
+ * @return      Pointer of oldest element
+ */
+fptp_t *dl_conv_queue_pop(dl_conv_queue_t *cq);
+
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ *
+ * @param cq     Input convolution queue
+ * @param item   The new element
+ */
+void dl_conv_queue_push(dl_conv_queue_t *cq, fptp_t* item);
+
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_get_queue_item(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a sigmoid operation on one element of the convolution queue.
+ * Gets the pointer of the element in the convolution queue by offset, does a sigmoid operation
+ * through this pointer, then returns the pointer
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_sigmoid_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on one element of the convolution queue.
+ * Gets the pointer of the element in the convolution queue by offset, does a tanh operation
+ * through this pointer, then returns the pointer
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_tanh_step(dl_conv_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a softmax operation on one element of the convolution queue.
+ * Gets the pointer of the element in the convolution queue by offset, does a softmax operation
+ * through this pointer, then returns the pointer
+ *
+ * @param cq      Input convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+fptp_t *dl_softmax_step(dl_conv_queue_t *cq, int offset);
+
+fptp_t *dl_relu_step(dl_conv_queue_t *cq, int offset);
+fptp_t *dl_relu_look(dl_matrix2d_t *cq, int offset);
+dl_matrix2d_t *dl_matrix_concat1(const dl_conv_queue_t *a, const dl_matrix2d_t *b);
+dl_matrix2d_t *dl_basic_lstm_layer1(const dl_conv_queue_t *in, dl_matrix2d_t *state_c, dl_matrix2d_t *state_h,
+                                   const dl_matrix2d_t *weight, const dl_matrix2d_t *bias);
+/**
+ * @brief Fast implementation of 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The returned pointer
+ *          is the first element of the output queue and should not be freed separately.
+ *
+ * @param in       Input convolution queue
+ * @param out      Output convolution queue
+ * @param rate     A positive int, the stride with which we sample input value
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @return         The result of atrous convolution
+ */
+fptp_t *dl_atrous_conv1d_step(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                              dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+fptp_t *dl_look_conv_step(dl_conv_queue_t *in, dl_matrix2d_t *out, int rate, int size,
+                         dl_matrix2d_t* kernel, dl_matrix2d_t* bias);
+
+/**
+ * @brief Fast implementation of a dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @warning All input and output convolution queues and matrices should be allocated. The returned pointer
+ *          is the first element of the output queue and should not be freed separately.
+ *
+ * @param in              Input convolution queue
+ * @param out             Output convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @return                The result of dilation layer
+ */
+fptp_t *dl_dilation_layer(dl_conv_queue_t *in, dl_conv_queue_t *out, int rate, int size,
+                          dl_matrix2d_t* filter_kernel, dl_matrix2d_t* filter_bias,
+                          dl_matrix2d_t* gate_kernel, dl_matrix2d_t* gate_bias);
+
+
+void test_atrous_conv(int size, int rate, int in_channel, int out_channel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_convq8_queue.h b/include/esp32s2/dl_lib_convq8_queue.h
new file mode 100644
index 0000000..28c5da7
--- /dev/null
+++ b/include/esp32s2/dl_lib_convq8_queue.h
@@ -0,0 +1,303 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ8_QUEUE_H
+#define DL_LIB_CONVQ8_QUEUE_H
+
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_matrixq8.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//8-bit fixed-point convolution FIFO queue, dimensions listed as [nch, n, c]
+typedef struct {
+    int n;           /*< the length of the queue */
+    int c;           /*< the number of queue elements */
+    int front;       /*< the front (top) position of the queue */
+    int nch;         /*< the channel of the queue */
+    int exponent;    /*< The values in items should be multiplied by pow(2,exponent)
+                         to get the real values */
+    q8tp_t *itemq;    /*< Pointer to item array */
+} dl_convq8_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate an 8-bit fixed-point multi-channel convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq8_queue_t *dl_convq8_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq8_queue_free(dl_convq8_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cqm    The fixed-point convolution queue whose itemq is set to 0
+ */
+void dl_convq8_queue_bzero(dl_convq8_queue_t *cqm);
+
+/**
+ * @brief Move the front pointer of queue forward,
+ *        the first (oldest) element becomes the last (newest) element.
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element
+ */
+q8tp_t *dl_convq8_queue_pop(dl_convq8_queue_t *cq);
+q8tp_t *dl_convq8_queue_popn(dl_convq8_queue_t *cq, int n);
+
+/**
+ * @brief  Insert the float-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation,  
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The float-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq8_queue_push_by_qmf(dl_convq8_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8(dl_convq8_queue_t *cq, int offset);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param ch      Channel index of queue
+ * @return        Pointer of the element
+ */
+q8tp_t *dl_get_queue_itemq8_mc(dl_convq8_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queues and matrices should be allocated before the call.
+ *          The function returns void; NOTE(review): the result is presumably written to the output queue - confirm.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          Kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Shift ratio used in dot operation between two 16-bit fixed point vector
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @return                None
+ */
+void dl_atrous_conv1dq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                            int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @Warning All input and output convolution queues and matrices should be allocated before the call.
+ *          The function returns void; NOTE(review): the result is presumably written to the output queue - confirm.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ * @return                None
+ */
+void dl_dilation_layerq8_steps(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                                dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                int offset, int prenum);
+
+
+
+
+dl_conv_queue_t *dl_convq8_queue_add(dl_convq8_queue_t *cq1, dl_convq8_queue_t *cq2);
+
+int8_t dl_sigmoid_lutq8(int in);
+/**
+ * @brief Allocate a 8-bit fixed-point Multi-Channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel number
+ * @return      Pointer to the convolution queue pointers, or NULL if out of memory
+ */
+dl_convq8_queue_t **dl_convq8_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a 8-bit fixed-point Multi-Channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number
+ */
+void dl_convq8_queue_mc_free(dl_convq8_queue_t **cqm, int nch);
+
+/**
+ * @brief Tanh activation function for 8-bit fixed-point Multi-Channel convolution queue input
+ *
+ * @param cqm     Input 8-bit fixed-point Multi-Channel convolution queue
+ * @param offset  Offset used to calculate the beginning of input conv queue 
+ * @param nch     The channel number
+ */
+void dl_tanh_convq8_mc(dl_convq8_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief Fast and quantised 16-bit implement for Multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        Usually, this layer is used as first layer for 8-bit network.
+ *
+ * @Warning All input and output convolution queues and matrices should be allocated. Input is a 16-bit queue array, output is an 8-bit queue array.
+ *
+ * @param in              Input 16bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_16in_mc_steps(dl_convq_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                        dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast and quantised 8-bit implement for Multi-channel 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queues and matrices should be allocated. The function returns void.
+ *
+ * @param in              Input 8bit fixed-point convolution queue array
+ * @param out             Output 8bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param kernel          The kernel matrix of filter
+ * @param bias            The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param out_exponent    Exponent of output
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_atrous_conv1dq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out,
+                                int nch, int rate, int size,
+                                dl_matrix2dq8_t* kernel, dl_matrix2dq8_t* bias, 
+                                int out_exponent, int offset, int prenum);
+
+/**
+ * @brief Fast implement of 8-bit dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @Warning All input and output convolution queues and matrices should be allocated. The function returns void.
+ *
+ * @param in              Input 8-bit fixed-point convolution queue array
+ * @param out             Output 8-bit fixed-point convolution queue array
+ * @param nch             The channel number
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param offset          Offset used to calculate the beginning of input conv queue
+ * @param prenum          The num to control the parameter size of preload operation
+ */
+void dl_dilation_layerq8_mc_steps(dl_convq8_queue_t **in, dl_convq8_queue_t **out, int nch, int rate, int size,
+                                    dl_matrix2dq8_t* filter_kernel, dl_matrix2dq8_t* filter_bias,
+                                    dl_matrix2dq8_t* gate_kernel, dl_matrix2dq8_t* gate_bias,
+                                    int offset, int prenum);    
+
+void dl_convq8_queue_mc_bzero(dl_convq8_queue_t **cqm, int nch);
+
+
+
+dl_convq8_queue_t *dl_convq8_queue_alloc_from_psram(int n, int c);
+
+qtp_t *dl_dilation_layerq16_8(dl_convq_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_dilation_layerq8(dl_convq8_queue_t *in, dl_convq8_queue_t *out, int rate, int size,
+                            dl_matrix2dq8_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                            dl_matrix2dq8_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+dl_matrix2dq8_t *dl_convq8_lstm_layer(const dl_convq8_queue_t *in, dl_convq8_queue_t *out, dl_matrix2dq8_t *state_c,
+                                      dl_matrix2dq8_t *state_h, const dl_matrix2dq8_t *in_weight, const dl_matrix2dq8_t *h_weight,
+                                      const dl_matrix2dq_t *bias, int prenum);
+
+qtp_t *dl_atrous_conv1dq8_16_s3(dl_convq8_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq8_t* kernel, dl_matrix2dq_t* bias, int prenum);
+
+void print_convq8(dl_convq8_queue_t *cq, int offset);
+void print_convq(dl_convq_queue_t *cq, int offset);
+void dl_relu_convq8(dl_convq8_queue_t *cq);
+
+void lstmq8_free(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_convq_queue.h b/include/esp32s2/dl_lib_convq_queue.h
new file mode 100644
index 0000000..ff190fe
--- /dev/null
+++ b/include/esp32s2/dl_lib_convq_queue.h
@@ -0,0 +1,382 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_CONVQ_QUEUE_H
+#define DL_LIB_CONVQ_QUEUE_H
+
+#include "dl_lib_matrixq.h"
+#include "dl_lib_conv_queue.h"
+#include "dl_lib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Fixed-point convolution FIFO queue.
+//Dimensions listed as [nch, n, c]
+typedef struct {
+    int n;           /*< the length of the queue */
+    int c;           /*< the number of queue elements */
+    int front;       /*< the front (top) position of the queue */
+    int nch;         /*< the channel of the queue */
+    int exponent;    /*< The values in items should be multiplied by pow(2,exponent)
+                         to get the real values */
+    qtp_t *itemq;    /*< Pointer to item array */
+} dl_convq_queue_t;
+
+/**
+ * @brief Allocate a fixed-point convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_from_psram(int n, int c);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc(int n, int c, int nch);
+
+/**
+ * @brief Allocate a fixed-point multi-channel convolution queue from PSRAM
+ *
+ * @param n     The length of queue
+ * @param c     The number of elements in the queue
+ * @param nch   The channel of conv queue
+ * @return      The convolution queue, or NULL if out of memory
+ */
+dl_convq_queue_t *dl_convq_queue_alloc_mc_from_psram(int n, int c, int nch);
+
+
+void dl_convq_to_matrix2dq(dl_convq_queue_t *cq, dl_matrix2dq_t* out, int row);
+
+/**
+ * @brief Free a fixed-point convolution queue
+ *
+ * @param cq     The fixed-point convolution queue to free
+ */
+void dl_convq_queue_free(dl_convq_queue_t *cq);
+
+/**
+ * @brief Set itemq of convolution queue to 0
+ *
+ * @param cq     The fixed-point convolution queue point
+ */
+void dl_convq_queue_bzero(dl_convq_queue_t *cq);
+
+/**
+ * @brief Move the front pointer of queue forward, 
+          the First(oldest) element become the last(newest) element, 
+ *
+ * @param cq    Input fixed-point convolution queue
+ * @return      Pointer of oldest element  
+ */
+qtp_t *dl_convq_queue_pop(dl_convq_queue_t *cq);
+qtp_t *dl_convq_queue_popn(dl_convq_queue_t *cq, int n);
+/**
+ * @brief  Remove the oldest element, then insert the input element at the end of queue
+ * @param cq     Input fixed-point convolution queue
+ * @param a      The new element
+ * @param shift  NOTE(review): presumably the shift applied to the new element while inserting - confirm
+ */
+void dl_convq_queue_push(dl_convq_queue_t *cq, dl_matrix2dq_t *a, int shift);
+
+/**
+ * @brief  Insert the float-point element at the end of queue.
+ *         The precision of fixed-point numbers is described by the Qm.f notation,  
+ *
+ * @param cq     Input fixed-point convolution queue
+ * @param item   The float-point element
+ * @param m_bit  The number of integer bits including the sign bits
+ * @param f_bit  The number of fractional bits
+ */
+void dl_convq_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+void dl_convq16_queue_push_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit);
+
+dl_conv_queue_t *dl_queue_from_convq(dl_convq_queue_t *cq1);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param last_num  Offset from the front of the queue
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int last_num);
+
+/**
+ * @brief   Get the pointer of element in the queue by offset
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param offset    Offset from the front of the queue
+ * @param ch        Channel index of convolution queue 
+ * @return          Pointer of the element
+ */
+qtp_t *dl_get_queue_itemq_mc(dl_convq_queue_t *cq, int offset, int ch);
+
+/**
+ * @brief   Does a tanh operation on one of the elements in the convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, and does the
+ *          tanh operation through this pointer. Nothing is returned.
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @return        None
+ */
+void dl_tanh_convq(dl_convq_queue_t *cq, int offset);
+
+/**
+ * @brief   Does a tanh operation on one of the elements in a multi-channel convolution queue.
+ *          Gets the pointer of the element in the convolution queue by offset, and does the
+ *          tanh operation through this pointer. Nothing is returned.
+ *
+ * @param cqm     Input fixed-point multi-channel convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param nch     The channel number of cqm
+ * @return        None
+ */
+void dl_tanh_convq_mc(dl_convq_queue_t **cqm, int offset, int nch);
+
+/**
+ * @brief   Does a relu operation on one of the elements in the convolution queue, located
+ *          through the pointer obtained by offset. Nothing is returned.
+ *
+ * @param cq        Input fixed-point convolution queue
+ * @param clip      NOTE(review): presumably the upper clipping value of the relu output - confirm
+ * @param last_num  Offset from the front of the queue
+ * @return          None
+ */
+void dl_relu_convq(dl_convq_queue_t *cq, fptp_t clip, int last_num);
+
+/**
+ * @brief   Does a softmax operation on the one of element in the convolution queue.
+ *          Gets the pointer of element in the convolution queue by offset, input data
+            stay as it is. Results are saved into the *out* array. 
+ *
+ * @param cq      Input fixed-point convolution queue
+ * @param offset  Offset from the front of the queue
+ * @param out     Old array to re-use. Passing NULL will allocate a new matrix.
+ * @return        softmax results
+ */
+fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution), based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param rate     A positive int, the stride with which we sample input value
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector
+ * @param prenum   The preload size; NOTE(review): presumably 0 means do not use preload - confirm
+ * @return         The result of atrous convolution
+ */
+qtp_t * dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows
+ *
+ *               |-> [gate(sigmoid)] -|
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer is last element of output queue and should not be freed separately.
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param rate            A positive int, the stride with which we sample input value
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          NOTE(review): presumably the offset used to locate the input element in the queue - confirm
+ * @param prenum          The preload size; NOTE(review): presumably 0 means do not use preload - confirm
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+   dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+   dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+   int filter_shift, int gate_shift, int offset, int prenum);
+
+
+qtp_t *dl_dilation_layerq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                          dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                          dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
+                          int filter_shift, int gate_shift, int prenum);
+
+qtp_t *dl_dilation_layerq16(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                             dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
+                             dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias, int prenum);
+
+
+qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
+                                 dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset, int prenum);
+
+/**
+ * @brief Add a pair of fixed-point convolution queues item-by-item, and return a float-point convolution queue
+ *
+ * @param cq1      First fixed-point convolution queue
+ * @param cq2      Second fixed-point convolution queue
+ * @return         The result of float-point convolution queue
+ */
+dl_conv_queue_t *dl_convq_queue_add(dl_convq_queue_t *cq1, dl_convq_queue_t *cq2);
+
+/**
+ * @brief Fast implement of LSTM layer by dl_atrous_conv1dq function
+ *
+ * @Warning LSTM kernel is split into two parts: the first part's input is the last layer output and its kernel is
+ *           parameter *in_weight*; the second part's input is the last frame LSTM output and its kernel is *h_weight*.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param state_c         Internal state of the LSTM network
+ * @param state_h         Internal state (previous output values) of the LSTM network
+ * @param in_weight       the LSTM kernel needed by first part
+ * @param h_weight        the LSTM kernel needed by second part
+ * @param bias            The bias matrix of LSTM. Can be NULL if a bias of 0 is required.
+ * @param in_shift        Shift ratio used in first part
+ * @param h_shift         Shift ratio used in second part
+ * @param prenum          The preload size; NOTE(review): presumably 0 means do not use preload - confirm
+ * @return                The result of LSTM layer
+ */
+dl_matrix2dq_t *dl_convq_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                    dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight,
+                                    const dl_matrix2dq_t *bias, int in_shift, int h_shift, int prenum);
+dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h,
+                                       const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
+
+dl_matrix2dq_t *dl_convq16_lstm_layer(dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c,
+                                       dl_matrix2dq_t *state_h, dl_matrix2dq_t *in_weight, dl_matrix2dq_t *h_weight,
+                                       dl_matrix2dq_t *bias, int prenum);
+
+/**
+ * @brief Allocate a fixed-point multi channel convolution queue
+ *
+ * @param n     The length of queue
+ * @param c     The channel number of elements in the queue
+ * @param nch   The channel number of convolution queue
+ * @return      Pointer to the convolution queue pointers, or NULL if out of memory
+ */
+dl_convq_queue_t **dl_convq_queue_mc_alloc(int n, int c, int nch);
+
+/**
+ * @brief Free a fixed-point multi channel convolution queue
+ *
+ * @param cqm     The fixed-point convolution queue to free
+ * @param nch     The channel number of cqm
+ */
+void dl_convq_queue_mc_free(dl_convq_queue_t **cqm, int nch);
+
+/**
+ * @brief Fast and quantised implement for 1D atrous convolution (a.k.a. convolution with holes or dilated convolution)
+ *        based on convolution queue.
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in       Input fixed-point convolution queue
+ * @param out      Output fixed-point convolution queue
+ * @param nch      The channel number of input 
+ * @param rate     A positive int, the stride with which we sample input value 
+ * @param size     A positive int, the size of 1D-filter
+ * @param kernel   The kernel matrix of filter
+ * @param bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param shift    Shift ratio used in dot operation between two 16-bit fixed point vector 
+ * @param offset   the offset to calculate input convq
+ * @param prenum   the preload size, 0: do not use preload function
+ * @return         The result of atrous convolution
+ */
+qtp_t *dl_atrous_conv1dq_mc_steps(  dl_convq_queue_t **in,
+                                    dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* kernel,
+									dl_matrix2dq_t* bias,
+									int shift,
+									int offset,
+									int prenum);
+
+/**
+ * @brief Fast implement of dilation layer as follows for multi channel input
+ *
+ *               |-> [gate(sigmoid)] -|        
+ *      input -  |                    |-> (*) - output
+ *               |-> [filter(tanh)]  -|   
+ *
+ * @Warning All input and output convolution queue and matrix should be allocated. The return pointer
+ *          is last element of output queue and should not be freed separately.
+ *
+ * @param in              Input fixed-point convolution queue
+ * @param out             Output fixed-point convolution queue
+ * @param nch             The channel number of input 
+ * @param rate            A positive int, the stride with which we sample input value 
+ * @param size            A positive int, the size of 1D-filter
+ * @param filter_kernel   The kernel matrix of filter
+ * @param filter_bias     The bias matrix of filter. Can be NULL if a bias of 0 is required.
+ * @param gate_kernel     The kernel matrix of gate
+ * @param gate_bias       The bias matrix of gate. Can be NULL if a bias of 0 is required.
+ * @param filter_shift    Shift ratio used in filter operation between two 16-bit fixed point vector
+ * @param gate_shift      Shift ratio used in gate operation between two 16-bit fixed point vector
+ * @param offset          The offset to calculate input convq
+ * @param prenum          The preload size, 0: do not use preload function
+ * @return                The result of dilation layer
+ */
+qtp_t *dl_dilation_layerq_mc_steps( dl_convq_queue_t **in, 
+									dl_convq_queue_t **out,
+									int nch,
+									int rate,
+									int size,
+                                    dl_matrix2dq_t* filter_kernel,
+									dl_matrix2dq_t* filter_bias,
+                                    dl_matrix2dq_t* gate_kernel,
+									dl_matrix2dq_t* gate_bias,
+                                    int filter_shift,
+									int gate_shift,
+									int offset,
+									int prenum);
+
+void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
+void test_lstm_convq(int size, int in_dim, int lstm_cell);
+void dl_nn_tanh_i162(dl_convq_queue_t **cqm, int offset, int nch);
+void dl_copy_queue_item_by_qmf(dl_convq_queue_t *cq, fptp_t* item, int m_bit, int f_bit, int offset, int ch);
+void dl_convq_queue_mc_bzero(dl_convq_queue_t **cqm, int nch);
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_matrix.h b/include/esp32s2/dl_lib_matrix.h
new file mode 100644
index 0000000..59f7d79
--- /dev/null
+++ b/include/esp32s2/dl_lib_matrix.h
@@ -0,0 +1,257 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIX_H
+#define DL_LIB_MATRIX_H
+
+#ifdef ESP_PLATFORM
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#include "freertos/queue.h"
+#include "esp_system.h"
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef float fptp_t;
+
+#if CONFIG_BT_SHARE_MEM_REUSE
+extern multi_heap_handle_t gst_heap;
+#endif
+
+//Flags for matrices
+#define DL_MF_FOREIGNDATA 1  /**< Matrix pointer and item data actually points to another matrix and should not be freed */
+#define DL_MF_FOREIGNITEM 2  /**< Only item data actually points to another matrix and should not be freed */
+
+//'Normal' float matrix; items are addressed through DL_ITM as item[x + y*stride]
+typedef struct {
+    int w;          /**< Width */
+    int h;          /**< Height */
+    int stride;     /**< Row stride, essentially how many items to skip to get to the same position in the next row */
+    int flags;      /**< Flags. OR of DL_MF_* values */
+    fptp_t *item;   /**< Pointer to item array */
+} dl_matrix2d_t;
+
+//Macro to quickly access the raw items in a matrix. Arguments and result are parenthesized so m may be any pointer expression.
+#define DL_ITM(m, x, y) ((m)->item[(x)+(y)*(m)->stride])
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_alloc(int w, int h);
+
+
+/**
+ * @brief Free a matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->item space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrix_free(dl_matrix2d_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrix_zero(dl_matrix2d_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2d_t *dl_matrix_copy_to_psram(const dl_matrix2d_t *m);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_slice(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @Warning The results are flattened in row-major order.
+ *   
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return  The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2d_t *dl_matrix_flatten(const dl_matrix2d_t *src, int x, int y, int w, int h, dl_matrix2d_t *in);
+
+/**
+ * @brief Generate a matrix from existing floating-point data
+ *
+ * @param w     Width of resulting matrix
+ * @param h     Height of resulting matrix
+ * @param data  Data to populate matrix with
+ * @return A newly allocated matrix populated with the given input data, or NULL if out of memory.
+ */
+dl_matrix2d_t *dl_matrix_from_data(int w, int h, int stride, const void *data);
+
+
+/**
+ * @brief Multiply a pair of matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_mul(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two matrices : res=a.b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_dot(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Add a pair of matrices item-by-item: out=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Added data. Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_add(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Divide a pair of matrices item-by-item: res=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data (a/b). Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_div(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Subtract a matrix from another, item-by-item: res=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Subtracted data (a-b). Can be equal to a or b to overwrite that.
+ */
+void dl_matrix_sub(const dl_matrix2d_t *a, const dl_matrix2d_t *b, dl_matrix2d_t *out);
+
+/**
+ * @brief Add a constant to every item of the matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrix_add_const(dl_matrix2d_t *subj, const fptp_t add);
+
+
+/**
+ * @brief Concatenate the rows of two matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated matrix with as values a|b
+ */
+dl_matrix2d_t *dl_matrix_concat(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+dl_matrix2d_t *dl_matrix_concat_h( dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+/**
+ * @brief Print the contents of a matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrix(const dl_matrix2d_t *a);
+
+/**
+ * @brief Return the average square error given a correct and a test matrix.
+ *
+ * ...Well, more or less. If anything, it gives an indication of the error between
+ * the two. Check the code for the exact implementation.
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return value indicating the relative difference between matrices
+ */
+float dl_matrix_get_avg_sq_err(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+
+/**
+ * @brief Check if two matrices have the same shape, that is, the same amount of rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrix_same_shape(const dl_matrix2d_t *a, const dl_matrix2d_t *b);
+
+
+/**
+ * @brief Get a specific item from the matrix
+ *
+ * Please use these for external matrix access instead of DL_ITM
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position (no bounds checking is performed)
+ */
+static inline fptp_t dl_matrix_get(const dl_matrix2d_t *m, const int x, const int y) {
+    return DL_ITM(m, x, y);
+}
+
+/**
+ * @brief Set a specific item in the matrix to the given value
+ *
+ * Please use these for external matrix access instead of DL_ITM
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position (no bounds checking is performed)
+ */
+static inline void dl_matrix_set(dl_matrix2d_t *m, const int x, const int y, fptp_t val) {
+    DL_ITM(m, x, y) = val;
+}
+
+void matrix_get_range(const dl_matrix2d_t *m, fptp_t *rmin, fptp_t *rmax);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32s2/dl_lib_matrixq.h b/include/esp32s2/dl_lib_matrixq.h
new file mode 100644
index 0000000..8ad397b
--- /dev/null
+++ b/include/esp32s2/dl_lib_matrixq.h
@@ -0,0 +1,387 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ_H
+#define DL_LIB_MATRIXQ_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int16_t qtp_t;
+
+//Quantized matrix. Uses fixed numbers and has the storage for the rows/columns inverted 
+//for easy use as a multiplicand without stressing out the flash cache too much.
+typedef struct {
+    int w;
+    int h;
+    int stride; //Normally equals h, not w!
+    int flags;  //presumably an OR of DL_MF_* values, as in dl_matrix2d_t -- TODO confirm
+    int exponent; //The values in itemq should be multiplied by pow(2,exponent) to get the real values.
+    qtp_t *itemq; //Pointer to the array of 16-bit mantissas (qtp_t is int16_t); addressed via DL_ITMQ
+} dl_matrix2dq_t;
+
+#define DL_QTP_SHIFT 15
+#define DL_QTP_RANGE ((1<<DL_QTP_SHIFT)-1)
+#define DL_ITMQ(m, x, y) ((m)->itemq[(y)+(x)*(m)->stride]) //y is the fast index (transposed vs. DL_ITM); args parenthesized
+#define DL_QTP_EXP_NA 255 //non-applicable exponent because matrix is null
+
+#define DL_SHIFT_AUTO 32
+
+/**
+ * @info About quantized matrices and shift values
+ *
+ * Grab a coffee (or tea, or hot water)  and sit down when you read this for the first 
+ * time. Quantized matrices can speed up your operations, but come with some quirks, and
+ * it's good to understand how they work before using them.
+ *
+ * The data in the quantized matrix type is stored similarly to floating-point types:
+ * when storing a real value, the value is stored as a mantissa (base number) and an
+ * exponent. The 'real' value that can be re-derived from those two numbers is something
+ * similar to mantissa*2^exponent. Up to this point, there's not that much difference from 
+ * the standard floating point implementations like e.g. IEEE-754.
+ *
+ * The difference with respect to quantized matrices is that for a quantized matrix, it is 
+ * assumed all values stored have more-or-less the same order of magnitude. This allows the
+ * matrix to only store all the mantissas, while the exponents are shared; there is only one 
+ * exponent for the entire matrix. This makes it quicker to handle matrix operations - the
+ * logic to fix the exponents only needs to happen once, while the rest can be done in simple
+ * integer arithmetic. It also nets us some memory savings - while normally a floating point
+ * number is 32-bit, storing only 16-bit mantissas as the matrix items almost halves the 
+ * memory requirements.
+ *
+ * While most of the details of handling the intricacies of the quantized matrixes are done
+ * transparently by the code in dl_lib_matrixq.c, some implementation details leak out, 
+ * specifically in places where addition/subtraction/division happens.
+ *
+ * The problem is that the routines do not know what the size of the resulting operation is. For
+ * instance, when adding two matrices of numbers, the resulting numbers *could* be large enough
+ * to overflow the mantissa of the result if the exponent is the same. However, if by default we
+ * assume the mantissas needs to be scaled back, we may lose precision.
+ *
+ * In order to counter this, all operations that have this issue have a ``shift`` argument. If 
+ * the argument is zero, the routine will be conservative, that is, increase the exponent of 
+ * the result to such an extent it's mathematically impossible a value in the result will exceed
+ * the maximum value that can be stored. However, when this argument is larger than zero, the
+ * algorithm will hold back on this scaling by the indicated amount of bits, preserving precision
+ * but increasing the chance of some of the calculated values not fitting in the mantissa anymore.
+ * If this happens, the value will be clipped to the largest (or, for negative values, smallest)
+ * value possible. (Neural networks usually are okay with this happening for a limited amount
+ * of matrix indices).
+ *
+ * For deciding on these shift values, it is recommended to start with a shift value of one, then
+ * use dl_matrixq_check_sanity on the result. If this indicates clipping, lower the shift value. 
+ * If it indicates bits are under-used, increase it. Note that for adding and subtraction, only
+ * shift values of 0 or 1 make sense; these routines will error out if you try to do something
+ * else.
+ *
+ * For neural networks and other noise-tolerant applications, note that even when 
+ * dl_matrixq_check_sanity does not indicate any problems, twiddling with the shift value may lead
+ * to slightly improved precision. Feel free to experiment.
+ **/
+
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_alloc(int w, int h);
+dl_matrix2dq_t *dl_matrixq_alloc_psram(int w, int h);
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @Return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq_t *out);
+
+/**
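+ * @brief Convert a floating-point matrix to a quantized one; m_bit/f_bit presumably select the Qm.f format. TODO: confirm and describe fully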
+ */
+dl_matrix2dq_t *dl_matrixq_from_matrix2d_by_qmf(const dl_matrix2d_t *m, dl_matrix2dq_t *out, int m_bit, int f_bit);
+
+
+/**
+ * @brief Convert a quantized matrix to a floating-point one.
+ *
+ * @param m     Quantized matrix to convert
+ * @param out   Floating-point matrix to re-use. If NULL, allocate a new one.
+ * @Return The floating-point version of the quantized matrix
+ **/
+dl_matrix2d_t *dl_matrix2d_from_matrixq(const dl_matrix2dq_t *m, dl_matrix2d_t *out);
+
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->itemq space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq_free(dl_matrix2dq_t *m);
+
+/**
+ * @brief Zero out the matrix
+ * Sets all entries in the matrix to 0.
+ *
+ * @param m     Matrix to zero
+ */
+void dl_matrixq_zero(dl_matrix2dq_t *m);
+
+/**
+ * @brief Copy the matrix into psram
+ * Copy the matrix from flash or iram/psram into psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq_t *dl_matrixq_copy_to_psram(const dl_matrix2dq_t *m);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b, Result is a fixed-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices: res=a.b, Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product.
+ *
+ * Result is a fixed-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot calls; this function can be
+ * much slower than dl_matrixq_dot .
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ * @param shift Shift ratio
+ */
+void dl_matrixq_dot_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Do a dotproduct of two quantized matrices : res=a.b. This always uses the simple & stupid C algo for the dot product. 
+ *
+ * Result is a floating-point matrix. 
+ *
+ * Use this only if you expect something is wrong with the accelerated routines that dl_matrixq_dot_matrix_out calls; this function can be
+ * much slower than dl_matrixq_dot_matrix_out.
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Dotproduct data. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrixq_dot_matrix_out_c_impl(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+/**
+ * @brief Do a dotproduct of a floating point and a quantized matrix. Result is a floating-point matrix.
+ *
+ * @param a     First multiplicand; float matrix
+ * @param b     Second multiplicand; quantized matrix
+ * @param res   Dotproduct data; float matrix. *Must* be a *different* matrix from a or b!
+ */
+void dl_matrix_matrixq_dot(const dl_matrix2d_t *a, const dl_matrix2dq_t *b, dl_matrix2d_t *res);
+
+
+/**
+ * @brief Print the contents of a quantized matrix to stdout. Used for debugging.
+ *
+ * @param a     The matrix to print.
+ */
+void dl_printmatrixq(const dl_matrix2dq_t *a);
+
+
+/**
+ * @brief Add a pair of quantized matrices item-by-item: res=a+b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Added data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_add(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Generate a new matrix using a range of items from an existing matrix.
+ * When using this, the data of the new matrix is not allocated/copied but it re-uses a pointer
+ * to the existing data. Changing the data in the resulting matrix, as a result, will also change
+ * the data in the existing matrix that has been sliced.
+ *
+ * @Warning In contrast to the floating point equivalent of this function, the fixed-point version
+ * of this has the issue that as soon as the output exponent of one of the slices changes, the data
+ * in the sliced matrix gets corrupted (because the exponent of that matrix is still the same.) If you
+ * use this function, either treat the slices as read-only, or assume the sliced matrix contains
+ * garbage after modifying the data in one of the slices.
+ *
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix (with foreign data) to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting slice matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_slice(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief select a range of items from an existing matrix and flatten them into one dimension.
+ *
+ * @Warning The results are flattened in row-major order.
+ *   
+ * @param x     X-offset of the origin of the returned matrix within the sliced matrix
+ * @param y     Y-offset of the origin of the returned matrix within the sliced matrix
+ * @param w     Width of the resulting matrix
+ * @param h     Height of the resulting matrix
+ * @param in    Old matrix to re-use. Passing NULL will allocate a new matrix.
+ * @return The resulting flatten matrix, or NULL if out of memory
+ */
+dl_matrix2dq_t *dl_matrixq_flatten(const dl_matrix2dq_t *src, int x, int y, int w, int h, dl_matrix2dq_t *in);
+
+/**
+ * @brief Subtract a quantized matrix from another, item-by-item: res=a-b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param res   Subtracted data. Can be equal to a or b to overwrite that.
+ * @param shift Shift value. Only 0 or 1 makes sense here. <ToDo: check>
+ */
+void dl_matrixq_sub(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *res, int shift);
+
+/**
+ * @brief Multiply a pair of quantized matrices item-by-item: res=a*b
+ *
+ * @param a     First multiplicand
+ * @param b     Second multiplicand
+ * @param res   Multiplicated data. Can be equal to a or b to overwrite that matrix.
+ */
+void dl_matrixq_mul( dl_matrix2dq_t *a,  dl_matrix2dq_t *b, dl_matrix2dq_t *res);
+
+/**
+ * @brief Divide a pair of quantized matrices item-by-item: res=a/b
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @param out   Divided data (a/b). Can be equal to a or b to overwrite that.
+ */
+void dl_matrixq_div(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b, dl_matrix2dq_t *out, int shift);
+
+/**
+ * @brief Check if two quantized matrices have the same shape, that is, the same amount of 
+ * rows and columns
+ *
+ * @param a     First of the two matrices to compare
+ * @param b     Second of the two matrices to compare
+ * @return true if the two matrices are shaped the same, false otherwise.
+ */
+int dl_matrixq_same_shape(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Concatenate the rows of two quantized matrices into a new matrix
+ *
+ * @param a     First matrix
+ * @param b     Second matrix
+ * @return A newly allocated quantized matrix with as values a|b
+ */
+dl_matrix2dq_t *dl_matrixq_concat(const dl_matrix2dq_t *a, const dl_matrix2dq_t *b);
+
+/**
+ * @brief Add a constant to every item of the quantized matrix
+ *
+ * @param subj  Matrix to add the constant to
+ * @param add   The constant
+ */
+void dl_matrixq_add_const(dl_matrix2dq_t *subj, const fptp_t add, int shift);
+
+/**
+ * @brief Check the sanity of a quantized matrix
+ *
+ * Due to the nature of quantized matrices, depending on the calculations a quantized
+ * matrix is the result of and the shift values chosen in those calculations, a quantized
+ * matrix may have an exponent and mantissas that lead to a loss of precision, either because
+ * most significant mantissa bits are unused, or because a fair amount of mantissas are 
+ * clipped. This function checks if this is the case and will report a message to stdout
+ * if significant loss of precision is detected.
+ *
+ * @param m     The quantized matrix to check
+ * @param name  A string to be displayed in the message if the sanity check fails
+ * @return True if matrix is sane, false otherwise
+ **/
+
+int dl_matrixq_check_sanity(dl_matrix2dq_t *m, const char *name);
+
+/**
+ * @brief re-adjust the exponent of the matrix to fit the mantissa better
+ *
+ * This function will shift up all the data in the mantissas so there are no
+ * most-significant bits that are unused in all mantissas. It will also adjust
+ * the exponent to keep the actual values in the matrix the same.
+ *
+ * Some operations done on a matrix, especially operations that re-use the
+ * result of earlier operations done in the same way, can lead to the loss of
+ * data because the exponent of the quantized matrix is never re-adjusted. You
+ * can do that explicitly by calling this function.
+ *
+ * @param m     The matrix to re-adjust
+**/
+void dl_matrixq_readjust_exp(dl_matrix2dq_t *m);
+
+
+
+/**
+ * @brief Get the floating-point value of a specific item from the quantized matrix
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @return Value in that position
+ */
+fptp_t dl_matrixq_get(const dl_matrix2dq_t *m, const int x, const int y);
+
+/**
+ * @brief Set a specific item in the quantized matrix to the given 
+ * floating-point value
+ *
+ * @warning If the given value is more than the exponent in the quantized matrix
+ * allows for, all mantissas in the matrix will be shifted down to make the value
+ * 'fit'. If, however, the exponent is such that the value would result in a
+ * quantized mantissa of 0, nothing is done.
+ *
+ * @param m     Matrix to access
+ * @param x     Column address
+ * @param y     Row address
+ * @param val   Value to write to that position
+ */
+void dl_matrixq_set(dl_matrix2dq_t *m, const int x, const int y, fptp_t val);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/include/esp32s2/dl_lib_matrixq8.h b/include/esp32s2/dl_lib_matrixq8.h
new file mode 100644
index 0000000..377df7c
--- /dev/null
+++ b/include/esp32s2/dl_lib_matrixq8.h
@@ -0,0 +1,80 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#ifndef DL_LIB_MATRIXQ8_H
+#define DL_LIB_MATRIXQ8_H
+
+#include <stdint.h>
+#include "dl_lib_matrix.h"
+#include "dl_lib.h"
+#include "dl_lib_matrixq.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef int8_t q8tp_t;
+
+typedef struct {
+    int w;
+    int h;
+    int stride; //Normally equals h, not w!
+    int flags;  //presumably an OR of DL_MF_* values, as in dl_matrix2d_t -- TODO confirm
+    int exponent; //The values in itemq should be multiplied by pow(2,exponent) to get the real values.
+    q8tp_t *itemq; //Pointer to the array of 8-bit quantized items (q8tp_t is int8_t); addressed via DL_ITMQ8
+} dl_matrix2dq8_t;
+
+#define DL_Q8TP_SHIFT 7
+#define DL_Q8TP_RANGE ((1<<DL_Q8TP_SHIFT)-1)
+#define DL_ITMQ8(m, x, y) ((m)->itemq[(y)+(x)*(m)->stride]) //y is the fast index; args parenthesized for pointer expressions
+
+/**
+ * @brief Allocate a matrix
+ *
+ * @param w     Width of the matrix
+ * @param h     Height of the matrix
+ * @return The matrix, or NULL if out of memory
+ */
+dl_matrix2dq8_t *dl_matrixq8_alloc(int w, int h);
+
+/**
+ * @brief Free a quantized matrix
+ * Frees the matrix structure and (if it doesn't have the DL_MF_FOREIGNDATA flag set) the m->itemq space as well.
+ *
+ * @param m     Matrix to free
+ */
+void dl_matrixq8_free(dl_matrix2dq8_t *m);
+
+/**
+ * @brief Copy a quantized matrix
+ * Copy a quantized matrix from flash or iram/psram
+ *
+ * @param m     Matrix to copy
+ */
+dl_matrix2dq8_t *dl_matrixq8_copy_to_psram(const dl_matrix2dq8_t *m);
+
+/**
+ * @brief Convert a floating-point matrix to a quantized matrix
+ *
+ * @param m     Floating-point matrix to convert
+ * @param out   Quantized matrix to re-use. If NULL, allocate a new one.
+ * @Return The quantized version of the floating-point matrix
+ */
+dl_matrix2dq8_t *dl_matrixq8_from_matrix2d(const dl_matrix2d_t *m, dl_matrix2dq8_t *out);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+
diff --git a/include/esp32s2/esp_aec.h b/include/esp32s2/esp_aec.h
new file mode 100644
index 0000000..36de9c1
--- /dev/null
+++ b/include/esp32s2/esp_aec.h
@@ -0,0 +1,105 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AEC_H_
+#define _ESP_AEC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_AEC_FFT                      // Not kiss_fft
+#define AEC_SAMPLE_RATE     16000        // Only Support 16000Hz
+#define AEC_FRAME_LENGTH_MS 32
+
+typedef struct aec_handle_t aec_handle_t;
+typedef enum {
+    AEC_MODE_SR_LOW_COST = 0,     // Low Cost AEC for speech recognition
+    AEC_MODE_SR_HIGH_PERF = 1,    // High Performance AEC for speech recognition
+    AEC_MODE_VOIP_LOW_COST = 3,   // Low Cost AEC for voice communication
+    AEC_MODE_VOIP_HIGH_PERF = 4,  // High Performance AEC for voice communication
+} aec_mode_t;
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ * Please get frame size by aec_get_chunksize() function
+ * 
+ * @param sample_rate       The Sampling frequency (Hz) must be 16000.
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Creates an instance to the AEC structure, same with aec_create().
+ * 
+ * @param filter_length     Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num       The input microphone channel number
+ * @param mode              The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
+
+/**
+ * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic.
+ *
+ * @warning The indata, refdata and outdata must be 16-bit signed. Please allocate memory by heap_caps_aligned_alloc().
+ * 
+ * @param handel      The instance of AEC.
+ * @param indata      An array of 16-bit signed audio samples from mic. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
+ * @param refdata     An array of 16-bit signed audio samples sent to the speaker.
+ * @param outdata     Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
+ * @return None
+ *
+ */
+void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int aec_get_chunksize(const aec_handle_t *handle);
+
+/**
+ * @brief Get AEC mode string 
+ * 
+ * @param aec_mode  The mode of AEC.
+ * 
+ * @return AEC mode string
+ */
+char * aec_get_mode_string(aec_mode_t aec_mode);
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void aec_destroy(aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AEC_H_
diff --git a/include/esp32s2/esp_afe_aec.h b/include/esp32s2/esp_afe_aec.h
new file mode 100644
index 0000000..9d60588
--- /dev/null
+++ b/include/esp32s2/esp_afe_aec.h
@@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only supports 1 microphone channel and 1 playback channel.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
diff --git a/include/esp32s2/esp_afe_config.h b/include/esp32s2/esp_afe_config.h
new file mode 100644
index 0000000..f9de6fe
--- /dev/null
+++ b/include/esp32s2/esp_afe_config.h
@@ -0,0 +1,69 @@
+#pragma once
+#include "esp_aec.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// VC:  Voice Communication
+
+// Set AFE_SR mode
+typedef enum {
+    SR_MODE_LOW_COST = 0,  // Deprecated, please use afe_mode_t, AFE mode: low cost mode
+    SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
+} afe_sr_mode_t;
+
+// Set AFE mode
+typedef enum {
+    AFE_MODE_LOW_COST = 0,  // AFE mode: low cost mode
+    AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
+} afe_mode_t;
+
+// Set AFE type
+typedef enum {
+    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+} afe_type_t;
+
+typedef enum {
+    AFE_MEMORY_ALLOC_MORE_INTERNAL = 1,          // malloc with more internal ram
+    AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
+    AFE_MEMORY_ALLOC_MORE_PSRAM = 3              // malloc with more psram
+} afe_memory_alloc_mode_t;
+
+typedef enum {
+    AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
+    AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
+    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
+    AFE_MN_PEAK_NO_AGC = 0,      // There is no agc gain
+} afe_mn_peak_agc_mode_t;
+
+typedef struct {
+    int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
+    int mic_num;      // microphone channel number
+    uint8_t *mic_ids; // microphone channel indices
+    int ref_num;      // playback reference channel number
+    uint8_t *ref_ids; // playback reference channel indices
+    int sample_rate;  // sample rate of audio
+} afe_pcm_config_t;
+
+typedef enum {
+    AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
+    AFE_NS_MODE_NET = 1,    // please use model name of NSNET
+} afe_ns_mode_t;
+
+typedef enum {
+    AFE_AGC_MODE_WEBRTC = 0,  // WEBRTC AGC
+    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
+} afe_agc_mode_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
diff --git a/include/esp32s2/esp_mfcc_fbank_int16.h b/include/esp32s2/esp_mfcc_fbank_int16.h
new file mode 100644
index 0000000..22a5f2c
--- /dev/null
+++ b/include/esp32s2/esp_mfcc_fbank_int16.h
@@ -0,0 +1,86 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t;
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channels
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning", "hamming", "sine", "rectangular", "povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data (esp_mfcc_data_t *).
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration on frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
+ * @param nch Channel count of samp (presumably the same as the nch field of esp_mfcc_opts_t; TODO confirm)
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
+ *         when done with this buffer. Note that some implementations require the buffer to be freed before another call to this function is done.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+/**
+ * @brief Clean all state of mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;
+    esp_mfcc_op_create_t create;
+    esp_mfcc_op_run_step_t run_step;
+    esp_mfcc_op_clean_t clean;
+} esp_mfcc_iface_t;
diff --git a/include/esp32s2/esp_mfcc_iface.h b/include/esp32s2/esp_mfcc_iface.h
new file mode 100644
index 0000000..0257768
--- /dev/null
+++ b/include/esp32s2/esp_mfcc_iface.h
@@ -0,0 +1,89 @@
+#pragma once
+#include "esp_speech_features.h"
+#include <stdint.h>
+
+/*
+This describes an interface for a MFCC runner, that is, some kind of implementation that can be
+fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
+multiple implementations can be used.
+*/
+
+typedef struct esp_mfcc_data_t esp_mfcc_data_t;
+
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
+typedef struct {
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channels
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning", "hamming", "sine", "rectangular", "povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
+    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
+    bool psram_first;   // Alloc memory from PSRAM first
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
+} esp_mfcc_opts_t;
+
+/**
+ * @brief Un-initialize and free a mfcc runner
+ *
+ * Function to free a previously allocated mfcc runner.
+ *
+ * @param r Runner object to destroy
+ */
+typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Initialize parameters for a mfcc runner.
+ *
+ * After creation, a mfcc runner needs to be initialized first; this is usually done
+ * in the initialization routine of a speech recognition algorithm. This provides
+ * a pointer to do this for a specific mfcc runner.
+ *
+ * @param opt Options for the mfcc process
+ * @return Pointer to the mfcc runner data (esp_mfcc_data_t *).
+ */
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+
+/**
+ * @brief Run a mfcc iteration on frame by frame
+ *
+ * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
+ * an initial call to this function may return NULL and subsequent calls may return the
+ * cepstrum of previous calls.
+ *
+ * @param r The mfcc runner
+ * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
+ * @param nch Channel count of samp (presumably the same as the nch field of esp_mfcc_opts_t; TODO confirm)
+ * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
+ *         when done with this buffer. Note that some implementations require the buffer to be freed before another call to this function is done.
+ */
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+
+typedef void (*esp_mfcc_op_run_step_s16_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t *fbank);
+
+/**
+ * @brief Clean all state of mfcc handle
+ *
+ * @param r The mfcc runner
+ */
+typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
+
+/**
+ * @brief Operations possible on a mfcc runner
+ */
+typedef struct {
+    esp_mfcc_op_destroy_t destroy;
+    esp_mfcc_op_create_t create;
+    esp_mfcc_op_run_step_t run_step;
+    esp_mfcc_op_run_step_s16_t run_step_s16;
+    esp_mfcc_op_clean_t clean;
+} esp_mfcc_iface_t;
diff --git a/include/esp32s2/esp_mfcc_models.h b/include/esp32s2/esp_mfcc_models.h
new file mode 100644
index 0000000..44086e8
--- /dev/null
+++ b/include/esp32s2/esp_mfcc_models.h
@@ -0,0 +1,44 @@
+#pragma once
+#include "esp_mfcc_iface.h"
+
+extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
+extern const esp_mfcc_iface_t esp_fbank_s16; // int16-fbank handle
+
+/**
+ * @brief Return basic opts used in wakenet9 & multinet5 (takes no arguments)
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9(void);
+
+/**
+ * @brief Return basic opts used in wakenet9s (takes no arguments)
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_wn9s16(void);
+
+/**
+ * @brief Return basic opts for default kaldifeat (takes no arguments)
+ *
+    opts->psram_first = true;
+    opts->use_power = true;
+    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
+    opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
+    opts->win_type = "povey";
+    opts->low_freq = 20;
+    opts->high_freq = 7600;
+    opts->samp_freq = 16000;
+    opts->nch = 1;
+    opts->nfft = 512;
+    opts->nfilter = 80;
+    opts->numcep = 80;
+    opts->preemph = 0.97;
+    opts->append_energy = false;
+    opts->winlen_ms = 25;
+    opts->winstep_ms = 10;
+    opts->remove_dc_offset = true;
+ *
+ **/
+esp_mfcc_opts_t *get_mfcc_opts_kaldi(void);
+
+/**
+ * @brief Print mfcc opts
+ **/
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
diff --git a/include/esp32s2/esp_speech_features.h b/include/esp32s2/esp_speech_features.h
new file mode 100644
index 0000000..c1659f9
--- /dev/null
+++ b/include/esp32s2/esp_speech_features.h
@@ -0,0 +1,62 @@
+#pragma once
+#include "c_speech_features_config.h"
+#include "stdlib.h"
+#include <assert.h>
+#include <stdbool.h>
+
+#ifndef M_2PI
+#define M_2PI 6.283185307179586476925286766559005
+#endif
+
+typedef struct {
+    float *coeff;
+    int *bank_pos;
+    int nfilter;
+} esp_mel_filter_t;
+
+float *esp_mfcc_malloc(size_t size, bool from_psram);
+
+void esp_mfcc_free(void *ptr);
+
+/**
+ * @brief Initialize FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
+ * @return fft-table
+ **/
+void *esp_fft_init(int nfft);
+
+/**
+ * @brief Free FFT table
+ * @warning For ESP-PLATFORM, use esp-dsp fft
+ *          For Other platform, use kiss fft
+ *
+ * @param fft_table  The fft table initialized by esp_fft_init
+ * @param nfft       The input samples number
+ * @return None
+ **/
+void esp_fft_deinit(void *fft_table, int nfft);
+
+/**
+ * @brief Initialize window function
+ *        Currently supports hanning, hamming, sine, povey, rectangular,
+ *        wn9 (512-hanning, to be compatible with wakenet9 & multinet5)
+ **/
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
+
+float *esp_fftr(float *x, int nfft, void *fft_table);
+
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+
+void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
+
+float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
+
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
+
+void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
+
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
diff --git a/include/esp32s2/esp_wn_iface.h b/include/esp32s2/esp_wn_iface.h
new file mode 100644
index 0000000..44bab8d
--- /dev/null
+++ b/include/esp32s2/esp_wn_iface.h
@@ -0,0 +1,215 @@
+#pragma once
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+/**
+ * @brief The state of wakeup
+ */
+typedef enum
+{
+    WAKENET_NO_DETECT = 0,               // wake word is not detected
+    WAKENET_CHANNEL_VERIFIED = -1,       // output channel is verified
+    WAKENET_DETECTED = 1                 // wake word is detected
+} wakenet_state_t;
+
+//Set wake words recognition operating mode
+//The probability of being wake words is increased with increasing mode, 
+//As a consequence also the false alarm rate goes up
+typedef enum {
+	DET_MODE_90 = 0,       // Normal
+	DET_MODE_95 = 1,       // Aggressive
+    DET_MODE_2CH_90 = 2,
+    DET_MODE_2CH_95 = 3,
+    DET_MODE_3CH_90 = 4,
+    DET_MODE_3CH_95 = 5,
+} det_mode_t;
+
+typedef struct {
+    int wake_word_num;     //The number of all wake words
+    char **wake_word_list; //The name list of wake words  
+} wake_word_info_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode and specified wake word coefficient
+ *
+ * @param model_name  The specified wake word model coefficient
+ * @param det_mode    The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const void *model_name, det_mode_t det_mode);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the channel number of samples that need to be passed to the detect function
+ *
+ * The returned value is the number of audio channels, not a number of samples.
+ * NOTE(review): stated from the function name and brief only; confirm against the implementation.
+ *
+ * @param model The model object to query
+ * @return The channel number of the samples to feed the detect function
+ */
+typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the start point of wake word when one wake word is detected. 
+ * 
+ * @Warning: This function should be called when the channel index is verified. 
+ * The returned value is the number of samples from start point of wake word to detected point. 
+ * 
+ * @param model The model object to query
+ * @return The number of samples from start point to detected point (end point)
+ */
+typedef int (*esp_wn_iface_op_get_start_point_t)(model_iface_data_t *model);
+
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_wn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of wake words
+ *
+ * @param model The model object to query
+ * @returns the number of wake words
+ */
+typedef int (*esp_wn_iface_op_get_word_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the name of wake word by index
+ *
+ * @Warning The index of wake word start with 1
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns the name of the wake word
+ */
+typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param word_index The index of wake word
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
+
+/**
+ * @brief Get the wake word detection threshold of different modes
+ *
+ * @param model The model object to query
+ * @param word_index The index of wake word
+ * @returns the detection threshold
+ */
+typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model, int word_index);
+
+/**
+ * @brief Feed samples of an audio stream to the keyword detection model and detect if there is a keyword found.
+ *
+ * @Warning The index of wake word start with 1, 0 means no wake words is detected.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used can be queried by the 
+ *        get_samp_chunksize function.
+ * @return The index of wake words, return 0 if no wake word is detected, else the index of the wake words.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Get the volume gain
+ *
+ * @param model The model object to query
+ * @param target_db  The target dB to calculate volume gain
+ * @returns the volume gain
+ */
+typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a speech recognition model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether it is voice.
+ *
+ * @param model The model object to query
+ * @param samples Presumably an array of 16-bit signed audio samples, as for the detect function (TODO confirm)
+ * @param cq An array of 16-bit MFCC.
+ * @return The index of wake words, return 0 if no wake word is detected, else
+ * the index of the wake words.
+ */
+typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a wake word detection model.
+ */
+typedef struct {
+    esp_wn_iface_op_create_t create;
+    esp_wn_iface_op_get_start_point_t get_start_point;
+    esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_wn_iface_op_get_channel_num_t get_channel_num;
+    esp_wn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_wn_iface_op_get_word_num_t get_word_num;
+    esp_wn_iface_op_get_word_name_t get_word_name;
+    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
+    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
+    esp_wn_iface_op_detect_t detect;
+    esp_wn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_wn_iface_op_clean_t clean;
+    esp_wn_iface_op_destroy_t destroy;
+} esp_wn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/include/esp32s2/esp_wn_models.h b/include/esp32s2/esp_wn_models.h
new file mode 100644
index 0000000..3a4d7e4
--- /dev/null
+++ b/include/esp32s2/esp_wn_models.h
@@ -0,0 +1,52 @@
+#pragma once
+#include "esp_wn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of wakenet model name is used to filter all wakenet from available models.
+#define ESP_WN_PREFIX "wn"
+
+/**
+ * @brief Get the wakenet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of wakenet
+ */
+const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
+
+/**
+ * @brief Get the wake word name from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
+ */
+char* esp_wn_wakeword_from_name(const char *model_name);
+
+#ifdef __cplusplus
+}
+#endif
+
+/*
+
+const esp_wn_iface_t *model = esp_wn_handle_from_name(model_name);
+
+//Initialize wakeNet model data
+model_iface_data_t *model_data=model->create(model_name, DET_MODE_90);
+
+//Set parameters of buffer
+int audio_chunksize=model->get_samp_chunksize(model_data);
+int frequency = model->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+int r=model->detect(model_data, buffer);
+if (r>0) {
+    printf("Detection triggered output %d.\n",  r);
+}
+
+//Destroy model
+model->destroy(model_data);
+
+*/
diff --git a/lib/esp32c3/libc_speech_features.a b/lib/esp32c3/libc_speech_features.a
new file mode 100644
index 0000000..28679c7
Binary files /dev/null and b/lib/esp32c3/libc_speech_features.a differ
diff --git a/lib/esp32c3/libdl_lib.a b/lib/esp32c3/libdl_lib.a
new file mode 100644
index 0000000..19a5e83
Binary files /dev/null and b/lib/esp32c3/libdl_lib.a differ
diff --git a/lib/esp32c3/libesp_audio_front_end.a b/lib/esp32c3/libesp_audio_front_end.a
new file mode 100644
index 0000000..4c99de1
Binary files /dev/null and b/lib/esp32c3/libesp_audio_front_end.a differ
diff --git a/lib/esp32c3/libesp_audio_processor.a b/lib/esp32c3/libesp_audio_processor.a
new file mode 100644
index 0000000..3b8b36c
Binary files /dev/null and b/lib/esp32c3/libesp_audio_processor.a differ
diff --git a/lib/esp32c3/libhufzip.a b/lib/esp32c3/libhufzip.a
new file mode 100644
index 0000000..4ec56b8
Binary files /dev/null and b/lib/esp32c3/libhufzip.a differ
diff --git a/lib/esp32c3/libwakenet.a b/lib/esp32c3/libwakenet.a
new file mode 100644
index 0000000..8955079
Binary files /dev/null and b/lib/esp32c3/libwakenet.a differ
diff --git a/lib/esp32c6/libc_speech_features.a b/lib/esp32c6/libc_speech_features.a
new file mode 100644
index 0000000..de94ea0
Binary files /dev/null and b/lib/esp32c6/libc_speech_features.a differ
diff --git a/lib/esp32c6/libdl_lib.a b/lib/esp32c6/libdl_lib.a
new file mode 100644
index 0000000..0f32a4b
Binary files /dev/null and b/lib/esp32c6/libdl_lib.a differ
diff --git a/lib/esp32c6/libesp_audio_front_end.a b/lib/esp32c6/libesp_audio_front_end.a
new file mode 100644
index 0000000..d92b830
Binary files /dev/null and b/lib/esp32c6/libesp_audio_front_end.a differ
diff --git a/lib/esp32c6/libesp_audio_processor.a b/lib/esp32c6/libesp_audio_processor.a
new file mode 100644
index 0000000..84192c9
Binary files /dev/null and b/lib/esp32c6/libesp_audio_processor.a differ
diff --git a/lib/esp32c6/libhufzip.a b/lib/esp32c6/libhufzip.a
new file mode 100644
index 0000000..1a4f151
Binary files /dev/null and b/lib/esp32c6/libhufzip.a differ
diff --git a/lib/esp32c6/libwakenet.a b/lib/esp32c6/libwakenet.a
new file mode 100644
index 0000000..c3413cf
Binary files /dev/null and b/lib/esp32c6/libwakenet.a differ
diff --git a/lib/esp32s2/libc_speech_features.a b/lib/esp32s2/libc_speech_features.a
new file mode 100644
index 0000000..2fe5691
Binary files /dev/null and b/lib/esp32s2/libc_speech_features.a differ
diff --git a/lib/esp32s2/libdl_lib.a b/lib/esp32s2/libdl_lib.a
new file mode 100644
index 0000000..9cd8d80
Binary files /dev/null and b/lib/esp32s2/libdl_lib.a differ
diff --git a/lib/esp32s2/libesp_audio_front_end.a b/lib/esp32s2/libesp_audio_front_end.a
new file mode 100644
index 0000000..2404377
Binary files /dev/null and b/lib/esp32s2/libesp_audio_front_end.a differ
diff --git a/lib/esp32s2/libesp_audio_processor.a b/lib/esp32s2/libesp_audio_processor.a
new file mode 100644
index 0000000..22c0987
Binary files /dev/null and b/lib/esp32s2/libesp_audio_processor.a differ
diff --git a/lib/esp32s2/libhufzip.a b/lib/esp32s2/libhufzip.a
new file mode 100644
index 0000000..694705f
Binary files /dev/null and b/lib/esp32s2/libhufzip.a differ
diff --git a/lib/esp32s2/libwakenet.a b/lib/esp32s2/libwakenet.a
new file mode 100644
index 0000000..9045c44
Binary files /dev/null and b/lib/esp32s2/libwakenet.a differ
diff --git a/test_apps/esp-tts/sdkconfig.ci.esp32c5 b/test_apps/esp-tts/sdkconfig.ci.esp32c5
new file mode 100644
index 0000000..ab64f22
--- /dev/null
+++ b/test_apps/esp-tts/sdkconfig.ci.esp32c5
@@ -0,0 +1,11 @@
+# This file was generated using idf.py save-defconfig. It can be edited manually.
+# Espressif IoT Development Framework (ESP-IDF) 5.4.1 Project Minimal Configuration
+#
+CONFIG_IDF_TARGET="esp32c5"
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
+CONFIG_PARTITION_TABLE_CUSTOM=y
+CONFIG_SPIRAM=y
+CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
+CONFIG_ESP_INT_WDT=n
+CONFIG_ESP_TASK_WDT_EN=n
diff --git a/test_apps/esp32c5/sdkconfig.ci.esp32c3 b/test_apps/esp32c5/sdkconfig.ci.esp32c3
new file mode 100644
index 0000000..f193d7a
--- /dev/null
+++ b/test_apps/esp32c5/sdkconfig.ci.esp32c3
@@ -0,0 +1,11 @@
+# This file was generated using idf.py save-defconfig. It can be edited manually.
+# Espressif IoT Development Framework (ESP-IDF) 5.4.1 Project Minimal Configuration
+#
+CONFIG_IDF_TARGET="esp32c3"
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASHSIZE_4MB=y
+CONFIG_PARTITION_TABLE_CUSTOM=y
+CONFIG_SR_WN_WN9S_HIESP=y
+CONFIG_ESP_MAIN_TASK_STACK_SIZE=148584
+CONFIG_ESP_TASK_WDT_EN=n
+CONFIG_IDF_EXPERIMENTAL_FEATURES=y
diff --git a/test_apps/esp32c5/sdkconfig.ci.esp32c6 b/test_apps/esp32c5/sdkconfig.ci.esp32c6
new file mode 100644
index 0000000..e672fe8
--- /dev/null
+++ b/test_apps/esp32c5/sdkconfig.ci.esp32c6
@@ -0,0 +1,11 @@
+# This file was generated using idf.py save-defconfig. It can be edited manually.
+# Espressif IoT Development Framework (ESP-IDF) 5.4.1 Project Minimal Configuration
+#
+CONFIG_IDF_TARGET="esp32c6"
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASHSIZE_4MB=y
+CONFIG_PARTITION_TABLE_CUSTOM=y
+CONFIG_SR_WN_WN9S_HIESP=y
+CONFIG_ESP_MAIN_TASK_STACK_SIZE=148584
+CONFIG_ESP_TASK_WDT_EN=n
+CONFIG_IDF_EXPERIMENTAL_FEATURES=y
diff --git a/test_apps/esp32c5/sdkconfig.ci.esp32s2 b/test_apps/esp32c5/sdkconfig.ci.esp32s2
new file mode 100644
index 0000000..1417a36
--- /dev/null
+++ b/test_apps/esp32c5/sdkconfig.ci.esp32s2
@@ -0,0 +1,12 @@
+# This file was generated using idf.py save-defconfig. It can be edited manually.
+# Espressif IoT Development Framework (ESP-IDF) 5.4.1 Project Minimal Configuration
+#
+CONFIG_IDF_TARGET="esp32s2"
+CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
+CONFIG_ESPTOOLPY_FLASHSIZE_4MB=y
+CONFIG_PARTITION_TABLE_CUSTOM=y
+CONFIG_SR_WN_WN9S_HIESP=y
+CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
+CONFIG_ESP_MAIN_TASK_STACK_SIZE=148584
+CONFIG_ESP_TASK_WDT_EN=n
+CONFIG_IDF_EXPERIMENTAL_FEATURES=y