feat: update esp32c3,esp32c5,esp32c6,esp32s2 lib to support afe

2025-09-15 15:28:44 +08:00 · 2025-06-20 15:02:52 +08:00 · 2025-06-20 15:02:52 +08:00 · 8636daabf6
commit 8636daabf6
parent 127e75d617
160 changed files with 6646 additions and 169 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -1,4 +1,4 @@
-if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32"))
+if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c5") OR (${IDF_TARGET} STREQUAL "esp32c6") OR  (${IDF_TARGET} STREQUAL "esp32s2"))
    set(include_dirs
        "esp-tts/esp_tts_chinese/include"
        "include/${IDF_TARGET}"
@ -46,9 +46,9 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
    idf_component_get_property(dl_fft_lib espressif__dl_fft COMPONENT_LIB)

    set(sr_libs 
-        dl_lib
        $<TARGET_FILE:${esp_dsp_lib}>
        $<TARGET_FILE:${dl_fft_lib}>
+        dl_lib
        c_speech_features
        esp_audio_front_end
        esp_audio_processor
@ -72,48 +72,6 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
        "-Wl,--end-group")


-elseif((${IDF_TARGET} STREQUAL "esp32c5") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6") OR  (${IDF_TARGET} STREQUAL "esp32s2"))
-    set(srcs
-        "src/model_path.c"
-    )
-
-    set(include_dirs
-        "include/${IDF_TARGET}"
-        "src/include"
-        "esp-tts/esp_tts_chinese/include"
-    )
-
-    set(requires
-        json
-        spiffs
-        esp_partition
-    )
-
-    idf_component_register(SRCS ${srcs}
-                        INCLUDE_DIRS ${include_dirs}
-                        REQUIRES ${requires}
-                        PRIV_REQUIRES spi_flash
-                        )
-    
-    component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format)
-    add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(dl_lib "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libdl_lib.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
-    add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
-
-    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE dl_lib)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE c_speech_features)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE hufzip)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE wakenet)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_tts_chinese)
-    target_link_libraries(${COMPONENT_LIB} PRIVATE voice_set_xiaole)
-
 endif()

 # Add model partition and flash srmodels.bin
--- a/include/esp32/esp_mfcc_models.h
+++ b/include/esp32/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32c3/esp_afe_aec.h
+++ b/include/esp32c3/esp_afe_aec.h
@ -2,9 +2,8 @@
 #ifndef _ESP_AFE_AEC_H_
 #define _ESP_AFE_AEC_H_

-
-#include "esp_afe_config.h"
 #include "esp_aec.h"
+#include "esp_afe_config.h"

 #include <stdint.h>

@ -13,19 +12,19 @@ extern "C" {
 #endif

 typedef struct {
-    aec_handle_t* handle;
+    aec_handle_t *handle;
    aec_mode_t mode;
    afe_pcm_config_t pcm_config;
    int frame_size;
-    int16_t  *data;
-}afe_aec_handle_t;
-
+    int16_t *data;
+} afe_aec_handle_t;

 /**
- * @brief Creates an instance to the AEC structure. 
- * 
- * @warning Currently only support 1 microphone channel and 1 playback channe. 
- * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only supports 1 microphone channel and 1 playback channel.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback
+ * channel will be selected.
 *
 * The input format, same as afe config:
 * M to represent the microphone channel
@ -37,7 +36,8 @@ typedef struct {
 *
 * @param input_format     The input format
 * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
- *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
+ * esp32c5.
 * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
 * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
@ -45,17 +45,17 @@ typedef struct {
 */
 afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);

-
 /**
 * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
- * 
+ *
 * @param inst        The instance of AEC.
- * @param indata      Input audio data, format is define by input_format. Note indata will be modified in function call.
- * @param outdata     Returns near-end signal with echo removed. 
+ * @param indata      Input audio data, format is defined by input_format.
+ * @param outdata     Near-end signal with echo removed. outdata must be 16-byte aligned;
+ *                    please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory

 * @return The bytes of outdata.
 */
-size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);

 /**
 * @brief Get frame size of AEC (the samples of one frame)
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
 */
 int afe_aec_get_chunksize(afe_aec_handle_t *handle);

-
 /**
 * @brief Free the AEC instance
 *
--- a/include/esp32c3/esp_afe_config.h
+++ b/include/esp32c3/esp_afe_config.h
@ -1,9 +1,15 @@
 #pragma once
 #include "esp_aec.h"
+#include "esp_agc.h"
+#include "esp_nsn_models.h"
+#include "esp_vad.h"
+#include "esp_vadn_models.h"
+#include "esp_wn_iface.h"
+#include "esp_wn_models.h"
+#include "model_path.h"
 #include "stdbool.h"
 #include "stdint.h"
 #include "stdlib.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -27,7 +33,8 @@ typedef enum {
 // Set AFE type
 typedef enum {
    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
-    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
+    AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
 } afe_type_t;

 typedef enum {
@ -62,8 +69,220 @@ typedef enum {
    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
 } afe_agc_mode_t;

+/**
+ * @brief Function to get the debug audio data
+ *
+ * @param data        The debug audio data, which must not be modified. It should be copied away as soon as possible
+ *                    to avoid blocking for too long.
+ * @param data_size   The number of bytes of data.
+ * @returns
+ */
+typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
+
+typedef enum {
+    AFE_DEBUG_HOOK_MASE_TASK_IN = 0,  // To get the input data of mase task
+    AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
+    AFE_DEBUG_HOOK_MAX = 2
+} afe_debug_hook_type_t;
+
+typedef struct {
+    afe_debug_hook_type_t hook_type;         // debug type of hook
+    afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
+} afe_debug_hook_t;
+
+typedef struct {
+    /********** AEC(Acoustic Echo Cancellation) **********/
+    bool aec_init;         // Whether to init aec
+    aec_mode_t aec_mode;   // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
+    int aec_filter_length; // The filter length of aec
+
+    /********** SE(Speech Enhancement, microphone array processing) **********/
+    bool se_init; // Whether to init se
+
+    /********** NS(Noise Suppression) **********/
+    bool ns_init;              // Whether to init ns
+    char *ns_model_name;       // Model name of ns
+    afe_ns_mode_t afe_ns_mode; // Model mode of ns
+
+    /********** VAD(Voice Activity Detection) **********/
+    bool vad_init;          // Whether to init vad
+    vad_mode_t vad_mode;    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
+    char *vad_model_name;   // The model name of vad. If it is null, WebRTC VAD will be used.
+    int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
+    int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
+                            // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
+    bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
+    bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
+
+    /********** WakeNet(Wake Word Engine) **********/
+    bool wakenet_init;
+    char *wakenet_model_name;   // The model name of wakenet 1
+    char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
+    det_mode_t wakenet_mode;    // The mode of wakenet
+
+    /********** AGC(Automatic Gain Control) **********/
+    bool agc_init; // Whether to init agc
+    afe_agc_mode_t
+        agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
+    int agc_compression_gain_db; // Compression gain in dB (default 9)
+    int agc_target_level_dbfs;   // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
+
+    /********** General AFE(Audio Front End) parameter **********/
+    afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
+    afe_mode_t afe_mode;         // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+    afe_type_t afe_type;         // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
+    int afe_perferred_core;      // The preferred core of afe se task, which is created in afe_create function.
+    int afe_perferred_priority;  // The preferred priority of afe se task, which is created in afe_create function.
+    int afe_ringbuf_size;        // The ring buffer size: the number of frame data in ring buffer.
+    afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
+    float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
+                           // directly on the output amplitude: out_linear_gain * amplitude.
+    bool debug_init;
+    bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
+                              // otherwise, select channel number by wakenet
+} afe_config_t;
+
+/**
+ * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
+ * on the chip target and input format. You can manually fine-tune it after creating the configuration
+ *
+ * The input format:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param models           Models from partition, which is configured by Kconfig
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_config_t*  The default config of afe
+ */
+afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
+
+/**
+ * @brief Check AFE configuration and make sure it is correct.
+ *
+ * @warning If there is a configuration conflict, this function will modify some parameters.
+ * The guiding behind these modifications is to maintain the highest performance of the output audio and results.
+ * And remove the conflict between different algorithms.
+ *
+ * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
+ * If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
+ *
+ * @param afe_config       Input AFE config
+ *
+ * @return afe_config_t*  The modified AFE config
+ */
+afe_config_t *afe_config_check(afe_config_t *afe_config);
+
+/**
+ * @brief Parse input format
+ *
+ * @param input_format The input format, same with afe_config_init() function
+ * @param pcm_config   The pcm config
+ *
+ * @return true if the input format is parsed successfully, otherwise false
+ */
+bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse I2S input data
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param mic_data     The output microphone data
+ * @param ref_data     The output playback reference data
+ * @param pcm_config   The pcm config
+ *
+ */
+void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse input data, from interleaved arrangement to contiguous arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Format input data, from contiguous arrangement to interleaved arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Adjust the gain of input data
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param data         The input audio data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param factor       The gain factor
+ *
+ * @return int16_t*    The output audio data
+ */
+int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
+
+/**
+ * @brief Concatenate input data into out_data (NOTE(review): brief inferred from the function name -- confirm)
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param in_data         The input audio data
+ * @param in_frame_size   Input data frame size of input
+ * @param channel_num     The channel number of input data, which is same as output data
+ * @param out_data        The output audio data
+ * @param out_frame_size  Output data frame size
+ *
+ */
+void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
+
+/**
+ * @brief Copy the afe config
+ *
+ * @param dst_config    The destination afe config
+ * @param src_config    The source afe config
+ *
+ * @return   The destination afe config
+ */
+afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
+
+/**
+ * @brief Print the afe config
+ *
+ * @param afe_config    The afe config
+ */
+void afe_config_print(const afe_config_t *afe_config);
+
+/**
+ * @brief Allocate afe config
+ *
+ * @return The afe config pointer
+ */
+afe_config_t *afe_config_alloc();
+
+/**
+ * @brief Free afe config
+ *
+ * @param afe_config  The afe config pointer
+ */
+void afe_config_free(afe_config_t *afe_config);

 #ifdef __cplusplus
 }
 #endif
-
--- a/include/esp32c3/esp_afe_doa.h
+++ b/include/esp32c3/esp_afe_doa.h
@ -0,0 +1,48 @@
+#ifndef _ESP_AFE_DOA_H_
+#define _ESP_AFE_DOA_H_
+
+#include "esp_doa.h"
+#include "esp_afe_config.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    doa_handle_t *doa_handle;
+    afe_pcm_config_t pcm_config;
+    int16_t *leftdata;
+    int16_t *rightdata;
+    int frame_size;
+} afe_doa_handle_t;
+
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param input_format     The input format
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized afe_doa_handle_t object pointer. Recommend using the above configuration for better performance
+ */
+afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
+/**
+ * @brief Process audio frame for direction estimation
+ * @param handle afe_doa_handle_t instance pointer
+ * @param indata Input audio data, format is defined by input_format.
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
+/**
+ * @brief Release all allocated resources
+ * @param handle afe_doa_handle_t instance pointer to be freed
+ */
+void afe_doa_destroy(afe_doa_handle_t *handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_AFE_DOA_H_ */
--- a/include/esp32c3/esp_afe_sr_iface.h
+++ b/include/esp32c3/esp_afe_sr_iface.h
@ -0,0 +1,237 @@
+#pragma once
+#include "esp_afe_config.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// afe_sr/AFE_SR: the audio front-end for speech recognition
+
+// Opaque AFE_SR data container
+typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
+
+/**
+ * @brief The state of vad
+ */
+typedef enum {
+    AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
+    AFE_VAD_SPEECH = 1   // Deprecated, please use vad_state_t, speech
+} afe_vad_state_t;
+
+/**
+ * @brief The result of fetch function
+ */
+typedef struct afe_fetch_result_t {
+    int16_t *data;      // the target channel data of audio.
+    int data_size;      // the size of data. The unit is byte.
+    int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
+                        // audio that was truncated.
+    int vad_cache_size; // the size of vad_cache. The unit is byte.
+    float data_volume;  // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
+                        // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
+                        // wakenet(about 1.5s), otherwise is the frame length.
+    wakenet_state_t wakeup_state; // the value is wakenet_state_t
+    int wake_word_index;          // if the wake word is detected. It will store the wake word index which start from 1.
+    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
+                             // start from 1.
+    vad_state_t vad_state;   // the value is vad_state_t
+    int trigger_channel_id;  // the channel index of output
+    int wake_word_length;    // the length of wake word. The unit is the number of samples.
+    int ret_value;           // the return state of fetch function
+    int16_t *raw_data;       // the multi-channel output data of audio.
+    int raw_data_channels;   // the channel number of raw data
+    float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
+    void *reserved;          // reserved for future use
+} afe_fetch_result_t;
+
+/**
+ * @brief Function to initialize an AFE_SR instance
+ *
+ * @param afe_config        The config of AFE_SR
+ * @returns Handle to the AFE_SR data
+ */
+typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
+
+/**
+ * @brief Get the amount of each channel samples per frame that need to be passed to the function
+ *
+ * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param afe The AFE_SR object to query
+ * @return The amount of samples to feed the fetch function
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the channel number
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The amount of total channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the function
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The sample rate, in hz
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Feed samples of an audio stream to the AFE_SR
+ *
+ * @warning  The input data should be arranged in the format of channel interleaving.
+ *           The last channel is reference signal if it has reference data.
+ *
+ * @param afe   The AFE_SR object to query
+ *
+ * @param in    The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
+ *              `get_feed_chunksize`.
+ * @return      The size of input
+ */
+typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *           Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *
+ * @param afe            The AFE_SR object to query
+ * @param ticks_to_wait  The timeout value, in ticks, to wait for the fetch result.
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
+
+/**
+ * @brief reset ringbuf of AFE.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Set wakenet detection threshold 
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @param threshold     The wakenet detection threshold, the value is between 0.4 and 0.9999.
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
+
+/**
+ * @brief Reset wakenet detection threshold to initial state
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
+
+/**
+ * @brief Reset one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Disable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Print all functions/modules/algorithms pipeline.
+ *       The pipeline is the order of the functions/modules/algorithms.
+ *       The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
+ *
+ * @param afe          The AFE_SR object to query
+ */
+typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Destroy an AFE_SR instance
+ *
+ * @param afe         AFE_SR object to destroy
+ */
+typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * This structure contains the functions used to do operations on a AFE_SR.
+ */
+typedef struct {
+    esp_afe_sr_iface_op_create_from_config_t create_from_config;
+    esp_afe_sr_iface_op_feed_t feed;
+    esp_afe_sr_iface_op_fetch_t fetch;
+    esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
+    esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
+    esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
+    esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
+    esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
+    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
+    esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
+    esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
+    esp_afe_sr_iface_op_disable_func_t disable_wakenet;
+    esp_afe_sr_iface_op_enable_func_t enable_wakenet;
+    esp_afe_sr_iface_op_disable_func_t disable_aec;
+    esp_afe_sr_iface_op_enable_func_t enable_aec;
+    esp_afe_sr_iface_op_disable_func_t disable_se;
+    esp_afe_sr_iface_op_enable_func_t enable_se;
+    esp_afe_sr_iface_op_disable_func_t disable_vad;
+    esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
+    esp_afe_sr_iface_op_disable_func_t disable_ns;
+    esp_afe_sr_iface_op_enable_func_t enable_ns;
+    esp_afe_sr_iface_op_disable_func_t disable_agc;
+    esp_afe_sr_iface_op_enable_func_t enable_agc;
+    esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
+    esp_afe_sr_iface_op_destroy_t destroy;
+} esp_afe_sr_iface_t;
+
+// struct is used to store the AFE handle and data for the AFE task
+typedef struct {
+    esp_afe_sr_data_t *afe_data;
+    esp_afe_sr_iface_t *afe_handle;
+    TaskHandle_t feed_task;
+    TaskHandle_t fetch_task;
+} afe_task_into_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_afe_sr_models.h
+++ b/include/esp32c3/esp_afe_sr_models.h
@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "esp_afe_sr_iface.h"
+
+esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_agc.h
+++ b/include/esp32c3/esp_agc.h
@ -0,0 +1,47 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AGC_H_
+#define _ESP_AGC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+////all non-negative values are valid, negative values are errors
+typedef enum {
+    ESP_AGC_SUCCESS = 0,   ////success
+    ESP_AGC_FAIL = -1, ////agc fail
+    ESP_AGC_SAMPLE_RATE_ERROR = -2,  ///sample rate can be only 8khz, 16khz, 32khz
+    ESP_AGC_FRAME_SIZE_ERROR = -3,   ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
+} ESP_AGE_ERR;
+
+typedef enum {
+    AGC_MODE_SR = -1,      // Bypass WEBRTC AGC
+    AGC_MODE_0 = 0,        // Only saturation protection
+    AGC_MODE_1 = 1,        // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_2 = 2,        // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_3 = 3,        // Fixed Digital Gain [compressionGaindB (default 8 dB)]
+} agc_mode_t;
+
+void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
+void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
+int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
+void esp_agc_close(void *agc_handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ESP_AGC_H_
--- a/include/esp32c3/esp_doa.h
+++ b/include/esp32c3/esp_doa.h
@ -0,0 +1,41 @@
+#ifndef _ESP_DOA_H_
+#define _ESP_DOA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct doa_handle_t doa_handle_t;
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
+ */
+doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
+
+/**
+ * @brief Release all allocated resources
+ * @param doa doa_handle_t instance pointer to be freed
+ */
+void esp_doa_destroy(doa_handle_t *doa);
+
+/**
+ * @brief Process audio frame for direction estimation
+ * @param doa doa_handle_t instance pointer
+ * @param left Left channel 16-bit PCM data
+ * @param right Right channel 16-bit PCM data
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_DOA_H_ */
--- a/include/esp32c3/esp_mase.h
+++ b/include/esp32c3/esp_mase.h
@ -0,0 +1,93 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_MASE_H_
+#define _ESP_MASE_H_
+#include <stdint.h>   // int16_t is used by mase_process(); keeps this header self-contained
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MASE_SAMPLE_RATE 16000        // Supports 16kHz only
+#define MASE_FRAME_SIZE 16            // Supports 16ms only
+#define MASE_MIC_DISTANCE 65          // According to physical design of mic-array
+
+/**
+ * @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array 
+ * are supported.
+ */
+typedef enum {
+    TWO_MIC_LINE = 0,
+    THREE_MIC_CIRCLE = 1
+} mase_mic_array_type_t;
+
+/**
+ * @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
+ */
+typedef enum {
+    NORMAL_ENHANCEMENT_MODE = 0,
+    WAKE_UP_ENHANCEMENT_MODE = 1
+} mase_op_mode_t;
+
+typedef void* mase_handle_t;
+
+/**
+ * @brief Creates an instance to the MASE structure.
+ *
+ * @param fs                The sampling frequency (Hz) must be 16000.
+ *
+ * @param frame_size        The length of the audio processing must be 16ms.
+ *
+ * @param array_type        '0' for 2-mic line array and '1' for 3-mic circular array.
+ *
+ * @param mic_distance      The distance between neighboring microphones in mm.
+ *
+ * @param operating_mode    '0' for normal mode and '1' for wake-up enhanced mode.
+ *
+ * @param filter_strength   Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: An instance of MASE
+ */
+mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
+
+/**
+ * @brief Performs mic array processing for one frame.
+ *
+ * @param st          The instance of MASE.
+ *
+ * @param in          An array of 16-bit signed audio samples from mic.
+ *
+ * @param dsp_out     Returns enhanced signal.
+ *
+ * @return None
+ *
+ */
+void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
+
+/**
+ * @brief Free the MASE instance
+ *
+ * @param st The instance of MASE.
+ *
+ * @return None
+ *
+ */
+void mase_destory(mase_handle_t st);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/esp32c3/esp_mfcc_models.h
+++ b/include/esp32c3/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32c3/esp_mn_iface.h
+++ b/include/esp32c3/esp_mn_iface.h
@ -0,0 +1,223 @@
+#pragma once
+#include "stdint.h"
+#include "esp_wn_iface.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ESP_MN_RESULT_MAX_NUM 5
+#define ESP_MN_MAX_PHRASE_NUM 400
+#define ESP_MN_MAX_PHRASE_LEN 63
+#define ESP_MN_MIN_PHRASE_LEN 2
+
+#define ESP_MN_PREFIX "mn"
+#define ESP_MN_ENGLISH "en"
+#define ESP_MN_CHINESE "cn"
+
+typedef enum {
+    ESP_MN_STATE_DETECTING = 0,     // detecting
+    ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_TIMEOUT = 2,       // time out
+} esp_mn_state_t;
+
+//Set multinet loading mode
+//The memory consumption decreases as the mode increases,
+//and as a consequence the CPU load goes up
+typedef enum {
+    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
+    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and load the rest from FLASH (default)
+    ESP_MN_LOAD_FROM_FLASH = 2,          // Load more weights from FLASH. Minimum memory consumption with slowest computation
+} esp_mn_loader_mode_t;
+
+typedef enum {
+    ESP_MN_GREEDY_SEARCH = 0,          // greedy search
+    ESP_MN_BEAM_SEARCH = 1,            // beam search
+    ESP_MN_BEAM_SEARCH_WITH_FST = 2,  // beam search with trie language model
+} esp_mn_search_method_t;
+
+typedef enum {
+    CHINESE_ID = 1,       // Chinese language
+    ENGLISH_ID = 2,       // English language
+} language_id_t;
+
+// Return all possible recognition results
+typedef struct{
+    esp_mn_state_t state;
+    int num;                                   // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
+    int command_id[ESP_MN_RESULT_MAX_NUM];     // The list of command id.
+    int phrase_id[ESP_MN_RESULT_MAX_NUM];      // The list of phrase id.
+    float prob[ESP_MN_RESULT_MAX_NUM];         // The list of probability.
+    char string[256];
+} esp_mn_results_t;
+
+typedef struct {
+    char *string;                               // command string
+    char *phonemes;                             // command phonemes, if applicable
+    int16_t command_id;                         // the command id
+    float threshold;                            // trigger threshold, default: 0
+    int16_t *wave;                              // prompt wave data of the phrase
+} esp_mn_phrase_t;
+
+typedef struct _mn_node_ {
+    esp_mn_phrase_t *phrase;
+    struct _mn_node_ *next;
+} esp_mn_node_t;
+
+typedef struct{
+    int16_t num;                                // The number of error phrases, which can not added into model
+    esp_mn_phrase_t **phrases;                  // The array of error phrase pointer
+} esp_mn_error_t;
+
+/**
+ * @brief Initialize a model instance with specified model name.
+ *
+ * @param model_name  The multinet model name.
+ * @param duration    The duration (ms) to trigger the timeout
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
+
+/**
+ * @brief Switch multinet mode to change memory consumption and CPU loading
+ *
+ * @warning Just Support multinet6 or later versions
+ *
+ * @param model The model object to query
+ * @param mode  The multinet loader mode
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
+
+/**
+ * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model       The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Callback function type to fetch the number of frames recognized by the command word
+ *
+ * @param model       The model object to query
+ * @return The number of the frames recognized by the command word
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
+ */
+typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model       The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the language of model
+ *
+ * @param model       The model object to query
+ * @return Language name string defined in this header, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
+ */
+typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
+ *
+ * @param model       The model object to query.
+ * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the
+ *                    get_samp_chunksize function.
+ * @return The state of multinet
+ */
+typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Destroy a speech commands recognition model
+ *
+ * @param model       The Model object to destroy
+ */
+typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get recognition results
+ *
+ * @param model       The Model object to query
+ *
+ * @return The current results.
+ */
+typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
+
+/**
+ * @brief Open the log print
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Clean all status of model
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Set the speech commands by mn_command_root
+ *
+ * @param model_data       The model object to query.
+ * @param mn_command_root  The speech commands link.
+ * @return The error phrase id info.
+ */
+typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
+
+
+/**
+ * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
+ *
+ * @param model_data     The model object to query
+*/
+typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
+
+/**
+ * @brief Check if input string can be tokenized
+ *
+ * @param model_data     The model object to query
+ * @param str            The input string
+*/
+typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
+
+typedef struct {
+    esp_mn_iface_op_create_t create;
+    esp_mn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
+    esp_mn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_mn_iface_op_get_language_t get_language;
+    esp_mn_iface_op_detect_t detect;
+    esp_mn_iface_op_destroy_t destroy;
+    esp_mn_iface_op_get_results_t get_results;
+    esp_mn_iface_op_open_log_t open_log;
+    esp_mn_iface_op_clean_t clean;
+    esp_wn_iface_op_set_speech_commands set_speech_commands;
+    esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
+    esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
+    esp_mn_iface_op_check_speech_command check_speech_command;
+} esp_mn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_mn_models.h
+++ b/include/esp32c3/esp_mn_models.h
@ -0,0 +1,66 @@
+#pragma once
+#include "esp_mn_iface.h"
+
+//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
+//a specific phrase or word.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief Get the multinet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of multinet
+ */
+esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
+
+/**
+ * @brief Get the multinet language from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The language of multinet
+ */
+char *esp_mn_language_from_name(char *model_name);
+
+/*
+ Configure the multinet model to use based on what's selected in menuconfig.
+*/
+
+#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
+#include "multinet2_ch.h"
+#define MULTINET_COEFF get_coeff_multinet2_ch
+#define MULTINET_MODEL_NAME "mn2_cn"
+
+#else
+#define MULTINET_COEFF      "COEFF_NULL"
+#define MULTINET_MODEL_NAME "NULL"
+#endif
+
+
+/* example
+
+esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
+
+//Initialize MultiNet model data; the second argument is the timeout duration in ms
+model_iface_data_t *model_data = multinet->create(model_name, 6000);
+add_speech_commands(multinet, model_data);
+
+//Set parameters of buffer
+int audio_chunksize=multinet->get_samp_chunksize(model_data);
+int frequency = multinet->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+esp_mn_state_t r=multinet->detect(model_data, buffer);
+if (r==ESP_MN_STATE_DETECTED) {
+    printf("Detection triggered output %d.\n",  r);
+}
+
+//Destroy model
+multinet->destroy(model_data);
+
+*/
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_ns.h
+++ b/include/esp32c3/esp_ns.h
@ -0,0 +1,86 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_NS_H_
+#define _ESP_NS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NS_USE_SPIARM       0
+#define NS_FRAME_LENGTH_MS     10          //Supports 10ms, 20ms, 30ms
+
+/**
+* The Sampling frequency (Hz) must be 16000Hz
+*/
+
+typedef void* ns_handle_t;
+
+/**
+ * @brief Creates an instance to the NS structure.
+ *
+ * @param frame_length   The length of the audio processing can be 10ms, 20ms, 30ms.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_create(int frame_length);
+
+/**
+ * @brief Creates an instance of the more powerful noise suppression algorithm.
+ *
+ * @warning frame_length only supports 10 ms.
+ *
+ * @param frame_length    The length of the audio processing can only be 10ms.
+ * @param mode            0: Mild, 1: Medium, 2: Aggressive
+ * @param sample_rate     The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
+ *
+ * @param inst        The instance of NS.
+ *
+ * @param indata      An array of 16-bit signed audio samples.
+ *
+ * @param outdata     An array of 16-bit signed audio samples after noise suppression.
+ *
+ * @return None
+ *
+ */
+void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Free the NS instance
+ *
+ * @param inst The instance of NS.
+ *
+ * @return None
+ *
+ */
+void ns_destroy(ns_handle_t inst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_NS_H_
--- a/include/esp32c3/esp_nsn_iface.h
+++ b/include/esp32c3/esp_nsn_iface.h
@ -0,0 +1,64 @@
+#pragma once
+#include "stdint.h"
+
+//Opaque model data container
+typedef struct esp_nsn_data_t esp_nsn_data_t;
+
+
+/**
+ * @brief Easy function type to initialize a model instance
+ *
+ * @param model_name The name of the model instance
+ * @returns Handle to the model data
+ */
+typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the process function
+ *
+ * Every noise suppression model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the process function
+ */
+typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the noise suppression model and get data after process.
+ *
+ *
+ * @param model The model object to query
+ * @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the 
+ *        get_samp_chunksize function.
+ * @param out_data An array of 16-bit signed audio samples after process.
+ * @return The state of return.
+ */
+typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the process function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Destroy a noise suppression model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a noise suppression model.
+ */
+typedef struct {
+    esp_nsn_iface_op_create_t create;
+    esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_nsn_iface_op_process_t process;
+    esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_nsn_iface_op_destroy_t destroy;
+} esp_nsn_iface_t;
--- a/include/esp32c3/esp_nsn_models.h
+++ b/include/esp32c3/esp_nsn_models.h
@ -0,0 +1,25 @
+#pragma once
+
+#include "esp_nsn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {   // declare with C linkage, consistent with the other model headers in this directory
+#endif
+
+/*
+The prefix of nsnet
+Now there are nsnet1 and nsnet2
+*/
+#define ESP_NSNET_PREFIX "nsnet"
+
+/**
+ * @brief Get the nsnet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of nsnet
+ */
+esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_speech_features.h
+++ b/include/esp32c3/esp_speech_features.h
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

 float *esp_fftr(float *x, int nfft, void *fft_table);

-float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);

 void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

--- a/include/esp32c3/esp_sr_webrtc.h
+++ b/include/esp32c3/esp_sr_webrtc.h
@ -0,0 +1,84 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_WEBRTC_H_
+#define _ESP_WEBRTC_H_
+
+#include <stdbool.h>  // bool is used by webrtc_process(); standard headers stay outside extern "C"
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "esp_agc.h"
+#include "esp_log.h"
+#include "esp_ns.h"
+#include "sr_ringbuf.h"
+#include "esp_heap_caps.h"
+
+typedef struct {
+    void *ns_handle;
+    void *agc_handle;
+    int frame_size;
+    int sample_rate;
+    int16_t *buff;
+    int16_t *out_data;
+    sr_ringbuf_handle_t rb;
+} webrtc_handle_t;
+
+/**
+ * @brief Creates an instance of webrtc.
+ *
+ * @warning frame_length_ms supports 10 ms, 20 ms, 30 ms and 32 ms.
+ *
+ * @param frame_length_ms    The length of the audio processing
+ * @param ns_mode            The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
+ * @param agc_mode           The mode of AGC
+ * @param agc_gain           The gain of AGC. default is 9
+ * @param agc_target_level   The target level of AGC. default is -3 dbfs
+ * @param sample_rate        The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of webrtc
+ */
+webrtc_handle_t *webrtc_create(
+    int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
+ *
+ * @param handle        The instance of webrtc.
+ * @param indata        An array of 16-bit signed audio samples.
+ * @param size          The sample size of output data
+ * @param enable_ns     Enable noise suppression
+ * @param enable_agc    Enable automatic gain control
+ *
+ * @return data after noise suppression
+ */
+int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
+
+/**
+ * @brief Free the webrtc instance
+ *
+ * @param handle The instance of webrtc.
+ *
+ * @return None
+ *
+ */
+void webrtc_destroy(webrtc_handle_t *handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_WEBRTC_H_
--- a/include/esp32c3/esp_vad.h
+++ b/include/esp32c3/esp_vad.h
@ -0,0 +1,178 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_VAD_H_
+#define _ESP_VAD_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
+
+/**
+ * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+ * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
+ */
+typedef enum {
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
+} vad_mode_t;
+
+typedef enum {
+    VAD_SILENCE = 0,
+    VAD_SPEECH = 1,
+} vad_state_t;
+
+typedef struct vad_trigger_tag {
+    vad_state_t state;
+    unsigned int min_speech_len;
+    unsigned int noise_len;
+    unsigned int min_noise_len;
+    unsigned int speech_len;
+} vad_trigger_t;
+
+#define vad_MAX_LEN (INT32_MAX - 1) // parenthesized so the macro expands safely inside larger expressions
+/**
+ * @brief Allocate VAD trigger
+ *
+ * @param min_speech_len  Minimum frame number of speech duration
+ * @param min_noise_len   Minimum frame number of noise duration
+ *
+ * @return Trigger pointer
+ **/
+vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
+
+/**
+ * @brief Free VAD trigger
+ **/
+void vad_trigger_free(vad_trigger_t *trigger);
+
+/**
+ * @brief Reset VAD trigger
+ **/
+void vad_trigger_reset(vad_trigger_t *trigger);
+
+/**
+ * @brief Detect voice activity by trigger
+ **/
+vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
+
+typedef struct {
+    vad_trigger_t *trigger;
+    void *vad_inst;
+    int sample_rate;
+    int frame_size;
+} vad_handle_with_trigger_t;
+
+typedef vad_handle_with_trigger_t *vad_handle_t;
+
+// typedef vad_handle_tag * vad_handle_t;
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create(vad_mode_t vad_mode);
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ * @param sample_rate       Sample rate in Hz
+ * @param one_frame_ms      Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
+ * @param min_speech_ms     Minimum speech duration, unit is ms
+ * @param min_noise_ms      Minimum noise duration, unit is ms
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @param sample_rate_hz    The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
+ * @param one_frame_ms      The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
+
+/**
+ * @brief Reset trigger state as Silence
+ *
+ * @param handle            The instance of VAD.
+ */
+void vad_reset_trigger(vad_handle_t handle);
+
+/**
+ * @brief Free the VAD instance
+ *
+ * @param inst The instance of VAD.
+ *
+ * @return None
+ *
+ */
+void vad_destroy(vad_handle_t inst);
+
+/*
+ * Programming Guide:
+ *
+ * @code{c}
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);     // Creates an instance to
+ * the VAD structure.
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Feed samples and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_VAD_H_
--- a/include/esp32c3/esp_vadn_iface.h
+++ b/include/esp32c3/esp_vadn_iface.h
@ -0,0 +1,164 @@
+#pragma once
+#include "esp_vad.h"
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+// /**
+//  * @brief The state of vad
+//  */
+// typedef enum {
+//     VAD_NOISE = -1,  // Noise
+//     VADNET_STATE_SILENCE = 0, // Silence
+//     VAD_SPEECH = 1   // Speech
+// } vad_state_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode
+ * and specified model name
+ *
+ * @param model_name  The specified model name
+ * @param mode        The voice activity detection mode
+ * @param channel_num The number of input audio channels
+ * @param min_speech_ms  The minimum duration of speech in ms to trigger vad
+ * speech
+ * @param min_noise_ms   The minimum duration of noise in ms to trigger vad
+ * noise
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
+    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect
+ * function
+ *
+ * Every speech recognition model processes a certain number of samples at the
+ * same time. This function can be used to query that amount. Note that the
+ * returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the channel number of samples that need to be passed to the detect
+ * function
+ *
+ * The returned value is a channel count, not a sample count. It is presumably
+ * the channel_num the model was created with -- TODO confirm against the
+ * implementation.
+ *
+ * @param model The model object to query
+ * @return The number of channels to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The detection threshold, the range of
+ * det_threshold is 0.5~0.9999
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the voice activity detection threshold
+ *
+ * @param model The model object to query
+ * @returns the detection threshold
+ */
+typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used
+ * can be queried by the get_samp_chunksize function.
+ * @return The VAD state as a vad_state_t value (VAD_SILENCE or VAD_SPEECH,
+ * declared in esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param cq An array of 16-bit MFCC.
+ * @return The VAD state as a vad_state_t value (VAD_SILENCE or VAD_SPEECH,
+ * declared in esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a model object
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * This structure contains the functions used to do operations on a voice
+ * activity detection model.
+ */
+typedef struct {
+    esp_vadn_iface_op_create_t create;
+    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_vadn_iface_op_get_channel_num_t get_channel_num;
+    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
+    esp_vadn_iface_op_detect_t detect;
+    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_vadn_iface_op_clean_t clean;
+    esp_vadn_iface_op_destroy_t destroy;
+} esp_vadn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_vadn_models.h
+++ b/include/esp32c3/esp_vadn_models.h
@ -0,0 +1,22 @@
+#pragma once
+#include "esp_vadn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of vadnet model name is used to filter all vadnet models from the available models.
+#define ESP_VADN_PREFIX "vadnet"
+
+/**
+ * @brief Get the vadnet handle from model name
+ *
+ * @param model_name   The name of model
+ * @returns The handle of vadnet
+ */
+const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c3/esp_wn_iface.h
+++ b/include/esp32c3/esp_wn_iface.h
@ -29,6 +29,7 @@ typedef enum {
    DET_MODE_2CH_95 = 3,
    DET_MODE_3CH_90 = 4,
    DET_MODE_3CH_95 = 5,
+	DET_MODE_90_COPY_PARAMS = 6,       // Aggressive
 } det_mode_t;

 typedef struct {
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
 * @brief Set the detection threshold to manually abjust the probability 
 *
 * @param model The model object to query
- * @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
 * @param word_index The index of wake word
 * @return 0: setting failed, 1: setting success
 */
 typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);

+/**
+ * @brief Reset the threshold to its initial state  
+ *
+ * @param model The model object to query
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
+
+
 /**
 * @brief Get the wake word detection threshold of different modes
 *
@ -200,6 +210,7 @@ typedef struct {
    esp_wn_iface_op_get_word_num_t get_word_num;
    esp_wn_iface_op_get_word_name_t get_word_name;
    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
--- a/include/esp32c3/esp_wn_models.h
+++ b/include/esp32c3/esp_wn_models.h
@ -11,7 +11,7 @@ extern "C" {
 /**
 * @brief Get the wakenet handle from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The handle of wakenet
 */
 const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
 /**
 * @brief Get the wake word name from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
 */
-char* esp_wn_wakeword_from_name(const char *model_name);
+char *esp_wn_wakeword_from_name(const char *model_name);

 #ifdef __cplusplus
 }
--- a/include/esp32c3/flite_g2p.h
+++ b/include/esp32c3/flite_g2p.h
@ -0,0 +1,29 @
+#ifndef __FLITE_G2P_H__
+#define __FLITE_G2P_H__
+
+/* Use C linkage so this header can be included from C++, like the other headers in this directory. */
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    int num_phonemes;
+    int phoneme_size;
+    char **phonemes;
+} flite_g2p_result;
+
+void flite_g2p_result_free(flite_g2p_result *result);
+
+flite_g2p_result *flite_g2p_get_result(const char *grapheme);
+
+void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p(const char *graphemes, int map_phonemes);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/esp32c5/esp_afe_aec.h
+++ b/include/esp32c5/esp_afe_aec.h
@ -2,9 +2,8 @@
 #ifndef _ESP_AFE_AEC_H_
 #define _ESP_AFE_AEC_H_

-
-#include "esp_afe_config.h"
 #include "esp_aec.h"
+#include "esp_afe_config.h"

 #include <stdint.h>

@ -13,19 +12,19 @@ extern "C" {
 #endif

 typedef struct {
-    aec_handle_t* handle;
+    aec_handle_t *handle;
    aec_mode_t mode;
    afe_pcm_config_t pcm_config;
    int frame_size;
-    int16_t  *data;
-}afe_aec_handle_t;
-
+    int16_t *data;
+} afe_aec_handle_t;

 /**
- * @brief Creates an instance to the AEC structure. 
- * 
- * @warning Currently only support 1 microphone channel and 1 playback channe. 
- * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only support 1 microphone channel and 1 playback channe.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback
+ * channel will be selected.
 *
 * The input format, same as afe config:
 * M to represent the microphone channel
@ -37,7 +36,8 @@ typedef struct {
 *
 * @param input_format     The input format
 * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
- *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4.
+ *                         Recommended filter_length = 2 for esp32c5.
 * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
 * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
@ -45,17 +45,17 @@ typedef struct {
 */
 afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);

-
 /**
 * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
- * 
+ *
 * @param inst        The instance of AEC.
- * @param indata      Input audio data, format is define by input_format. Note indata will be modified in function call.
- * @param outdata     Returns near-end signal with echo removed. 
+ * @param indata      Input audio data, format is define by input_format.
+ * @param outdata     Near-end signal with echo removed. outdata must be 16-byte aligned;
+ *                    please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory.

 * @return The bytes of outdata.
 */
-size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);

 /**
 * @brief Get frame size of AEC (the samples of one frame)
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
 */
 int afe_aec_get_chunksize(afe_aec_handle_t *handle);

-
 /**
 * @brief Free the AEC instance
 *
--- a/include/esp32c5/esp_afe_config.h
+++ b/include/esp32c5/esp_afe_config.h
@ -1,9 +1,15 @@
 #pragma once
 #include "esp_aec.h"
+#include "esp_agc.h"
+#include "esp_nsn_models.h"
+#include "esp_vad.h"
+#include "esp_vadn_models.h"
+#include "esp_wn_iface.h"
+#include "esp_wn_models.h"
+#include "model_path.h"
 #include "stdbool.h"
 #include "stdint.h"
 #include "stdlib.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -27,7 +33,8 @@ typedef enum {
 // Set AFE type
 typedef enum {
    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
-    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
+    AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
 } afe_type_t;

 typedef enum {
@ -62,8 +69,220 @@ typedef enum {
    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
 } afe_agc_mode_t;

+/**
+ * @brief Function to get the debug audio data
+ *
+ * @param data        The debug audio data, which must not be modified. It should be copied away as soon as
+ * possible to avoid blocking for too long.
+ * @param data_size   The number of bytes of data.
+ * @returns None
+ */
+typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
+
+typedef enum {
+    AFE_DEBUG_HOOK_MASE_TASK_IN = 0,  // To get the input data of mase task
+    AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
+    AFE_DEBUG_HOOK_MAX = 2
+} afe_debug_hook_type_t;
+
+typedef struct {
+    afe_debug_hook_type_t hook_type;         // debug type of hook
+    afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
+} afe_debug_hook_t;
+
+typedef struct {
+    /********** AEC(Acoustic Echo Cancellation) **********/
+    bool aec_init;         // Whether to init aec
+    aec_mode_t aec_mode;   // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
+    int aec_filter_length; // The filter length of aec
+
+    /********** SE(Speech Enhancement, microphone array processing) **********/
+    bool se_init; // Whether to init se
+
+    /********** NS(Noise Suppression) **********/
+    bool ns_init;              // Whether to init ns
+    char *ns_model_name;       // Model name of ns
+    afe_ns_mode_t afe_ns_mode; // Model mode of ns
+
+    /********** VAD(Voice Activity Detection) **********/
+    bool vad_init;          // Whether to init vad
+    vad_mode_t vad_mode;    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
+    char *vad_model_name;   // The model name of vad. If it is null, WebRTC VAD will be used.
+    int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
+    int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
+                            // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
+    bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
+    bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
+
+    /********** WakeNet(Wake Word Engine) **********/
+    bool wakenet_init;
+    char *wakenet_model_name;   // The model name of wakenet 1
+    char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
+    det_mode_t wakenet_mode;    // The mode of wakenet
+
+    /********** AGC(Automatic Gain Control) **********/
+    bool agc_init; // Whether to init agc
+    afe_agc_mode_t
+        agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain.
+    int agc_compression_gain_db; // Compression gain in dB (default 9)
+    int agc_target_level_dbfs;   // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
+
+    /********** General AFE(Audio Front End) parameter **********/
+    afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
+    afe_mode_t afe_mode;         // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+    afe_type_t afe_type;         // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
+    int afe_perferred_core;      // The preferred core of afe se task, which is created in afe_create function.
+    int afe_perferred_priority;  // The preferred priority of afe se task, which is created in afe_create function.
+    int afe_ringbuf_size;        // The ring buffer size: the number of frame data in ring buffer.
+    afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
+    float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
+                           // directly on the output amplitude: out_linear_gain * amplitude.
+    bool debug_init;
+    bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone;
+                              // otherwise, select channel number by wakenet
+} afe_config_t;
+
+/**
+ * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
+ * on the chip target and input format. You can manually fine-tune it after creating the configuration
+ *
+ * The input format:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param models           Models from partition, which is configured by Kconfig
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_config_t*  The default config of afe
+ */
+afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
+
+/**
+ * @brief Check AFE configuration and make sure it is correct.
+ *
+ * @warning If there is a configuration conflict, this function will modify some parameters.
+ * The guiding principle behind these modifications is to maintain the highest performance of the output audio
+ * and results, and to remove the conflict between different algorithms.
+ *
+ * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
+ * If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
+ *
+ * @param afe_config       Input AFE config
+ *
+ * @return afe_config_t*  The modified AFE config
+ */
+afe_config_t *afe_config_check(afe_config_t *afe_config);
+
+/**
+ * @brief Parse input format
+ *
+ * @param input_format The input format, same with afe_config_init() function
+ * @param pcm_config   The pcm config
+ *
+ * @return true if the input format is parsed successfully, otherwise false
+ */
+bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse I2S input data
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param mic_data     The output microphone data
+ * @param ref_data     The output playback reference data
+ * @param pcm_config   The pcm config
+ *
+ */
+void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse input data, from interleaved arrangement to contiguous arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Format input data, from contiguous arrangement to interleaved arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Adjust the gain of input data
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param data         The input audio data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param factor       The gain factor
+ *
+ * @return int16_t*    The output audio data
+ */
+int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
+
+/**
+ * @brief Concatenate input data into output data (TODO confirm; the previous brief duplicated afe_adjust_gain)
+ *
+ * @warning the input data will be modified inplace. (NOTE(review): possibly copied from afe_adjust_gain; verify)
+ *
+ * @param in_data         The input audio data
+ * @param in_frame_size   The frame size of input data
+ * @param channel_num     The channel number of input data, which is same as output data
+ * @param out_data        The output audio data
+ * @param out_frame_size  The frame size of output data
+ *
+ */
+void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
+
+/**
+ * @brief Copy the afe config
+ *
+ * @param dst_config    The destination afe config
+ * @param src_config    The source afe config
+ *
+ * @return   The destination afe config
+ */
+afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
+
+/**
+ * @brief Print the afe config
+ *
+ * @param afe_config    The afe config
+ */
+void afe_config_print(const afe_config_t *afe_config);
+
+/**
+ * @brief Allocate afe config
+ *
+ * @return The afe config pointer
+ */
+afe_config_t *afe_config_alloc(void);
+
+/**
+ * @brief Free afe config
+ *
+ * @param afe_config  The afe config pointer
+ */
+void afe_config_free(afe_config_t *afe_config);

 #ifdef __cplusplus
 }
 #endif
-
--- a/include/esp32c5/esp_afe_doa.h
+++ b/include/esp32c5/esp_afe_doa.h
@ -0,0 +1,48 @@
+#ifndef _ESP_AFE_DOA_H_
+#define _ESP_AFE_DOA_H_
+
+#include "esp_doa.h"
+#include "esp_afe_config.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    doa_handle_t *doa_handle;
+    afe_pcm_config_t pcm_config;
+    int16_t *leftdata;
+    int16_t *rightdata;
+    int frame_size;
+} afe_doa_handle_t;
+
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param input_format     The input format
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized afe_doa_handle_t object pointer. Using the example configuration above is recommended for better performance
+ */
+afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
+/**
+ * @brief Process audio frame for direction estimation
+ * @param handle afe_doa_handle_t instance pointer
+ * @param indata Input audio data, format is define by input_format.
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
+/**
+ * @brief Release all allocated resources
+ * @param handle afe_doa_handle_t instance pointer to be freed
+ */
+void afe_doa_destroy(afe_doa_handle_t *handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_AFE_DOA_H_ */
--- a/include/esp32c5/esp_afe_sr_iface.h
+++ b/include/esp32c5/esp_afe_sr_iface.h
@ -0,0 +1,237 @@
+#pragma once
+#include "esp_afe_config.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// afe_sr/AFE_SR: the audio front-end for speech recognition
+
+// Opaque AFE_SR data container
+typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
+
+/**
+ * @brief The state of vad
+ */
+typedef enum {
+    AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
+    AFE_VAD_SPEECH = 1   // Deprecated, please use vad_state_t, speech
+} afe_vad_state_t;
+
+/**
+ * @brief The result of fetch function
+ */
+typedef struct afe_fetch_result_t {
+    int16_t *data;      // the target channel data of audio.
+    int data_size;      // the size of data. The unit is byte.
+    int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
+                        // audio that was truncated.
+    int vad_cache_size; // the size of vad_cache. The unit is byte.
+    float data_volume;  // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
+                        // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
+                        // wakenet(about 1.5s), otherwise is the frame length.
+    wakenet_state_t wakeup_state; // the value is wakenet_state_t
+    int wake_word_index;          // if the wake word is detected, it stores the wake word index, which starts from 1.
+    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
+                             // starts from 1.
+    vad_state_t vad_state;   // the value is vad_state_t
+    int trigger_channel_id;  // the channel index of output
+    int wake_word_length;    // the length of wake word. The unit is the number of samples.
+    int ret_value;           // the return state of fetch function
+    int16_t *raw_data;       // the multi-channel output data of audio.
+    int raw_data_channels;   // the channel number of raw data
+    float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
+    void *reserved;          // reserved for future use
+} afe_fetch_result_t;
+
+/**
+ * @brief Function to initialize an AFE_SR instance
+ *
+ * @param afe_config        The config of AFE_SR
+ * @returns Handle to the AFE_SR data
+ */
+typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
+
+/**
+ * @brief Get the amount of each channel samples per frame that need to be passed to the function
+ *
+ * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param afe The AFE_SR object to query
+ * @return The amount of samples to feed the fetch function
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the channel number
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The amount of total channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the function
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The sample rate, in hz
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Feed samples of an audio stream to the AFE_SR
+ *
+ * @warning  The input data should be arranged in the format of channel interleaving.
+ *           The last channel is reference signal if it has reference data.
+ *
+ * @param afe   The AFE_SR object to query
+ *
+ * @param in    The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
+ *              `get_feed_chunksize`.
+ * @return      The size of input
+ */
+typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *           Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *
+ * @param afe            The AFE_SR object to query
+ * @param ticks_to_wait  The timeout value, in ticks, to wait for the fetch result.
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
+
+/**
+ * @brief reset ringbuf of AFE.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Set wakenet detection threshold 
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @param threshold     The wakenet detection threshold, the value is between 0.4 and 0.9999.
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
+
+/**
+ * @brief Reset wakenet detection threshold to initial state
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
+
+/**
+ * @brief Reset one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Disable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Print all functions/modules/algorithms pipeline.
+ *       The pipeline is the order of the functions/modules/algorithms.
+ *       The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
+ *
+ * @param afe          The AFE_SR object to query
+ */
+typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Destroy a AFE_SR instance
+ *
+ * @param afe         AFE_SR object to destroy
+ */
+typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * This structure contains the functions used to do operations on a AFE_SR.
+ */
+typedef struct {
+    esp_afe_sr_iface_op_create_from_config_t create_from_config;
+    esp_afe_sr_iface_op_feed_t feed;
+    esp_afe_sr_iface_op_fetch_t fetch;
+    esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
+    esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
+    esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
+    esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
+    esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
+    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
+    esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
+    esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
+    esp_afe_sr_iface_op_disable_func_t disable_wakenet;
+    esp_afe_sr_iface_op_enable_func_t enable_wakenet;
+    esp_afe_sr_iface_op_disable_func_t disable_aec;
+    esp_afe_sr_iface_op_enable_func_t enable_aec;
+    esp_afe_sr_iface_op_disable_func_t disable_se;
+    esp_afe_sr_iface_op_enable_func_t enable_se;
+    esp_afe_sr_iface_op_disable_func_t disable_vad;
+    esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
+    esp_afe_sr_iface_op_disable_func_t disable_ns;
+    esp_afe_sr_iface_op_enable_func_t enable_ns;
+    esp_afe_sr_iface_op_disable_func_t disable_agc;
+    esp_afe_sr_iface_op_enable_func_t enable_agc;
+    esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
+    esp_afe_sr_iface_op_destroy_t destroy;
+} esp_afe_sr_iface_t;
+
+// struct is used to store the AFE handle and data for the AFE task
+typedef struct {
+    esp_afe_sr_data_t *afe_data;
+    esp_afe_sr_iface_t *afe_handle;
+    TaskHandle_t feed_task;
+    TaskHandle_t fetch_task;
+} afe_task_into_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_afe_sr_models.h
+++ b/include/esp32c5/esp_afe_sr_models.h
@ -0,0 +1,13 @@
+#pragma once
+
+#include "esp_afe_sr_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_agc.h
+++ b/include/esp32c5/esp_agc.h
@ -0,0 +1,47 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AGC_H_
+#define _ESP_AGC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// All non-negative values are valid; negative values are errors
+typedef enum {
+    ESP_AGC_SUCCESS = 0,   ////success
+    ESP_AGC_FAIL = -1, ////agc fail
+    ESP_AGC_SAMPLE_RATE_ERROR = -2,  ///sample rate can be only 8khz, 16khz, 32khz
+    ESP_AGC_FRAME_SIZE_ERROR = -3,   ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
+} ESP_AGE_ERR;
+
+typedef enum {
+    AGC_MODE_SR = -1,      // Bypass WEBRTC AGC
+    AGC_MODE_0 = 0,        // Only saturation protection
+    AGC_MODE_1 = 1,        // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_2 = 2,        // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_3 = 3,        // Fixed Digital Gain [compressionGaindB (default 8 dB)]
+} agc_mode_t;
+
+void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
+void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
+int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
+void esp_agc_close(void *agc_handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ESP_AGC_H_
--- a/include/esp32c5/esp_doa.h
+++ b/include/esp32c5/esp_doa.h
@ -0,0 +1,41 @@
+#ifndef _ESP_DOA_H_
+#define _ESP_DOA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct doa_handle_t doa_handle_t;
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
+ */
+doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
+
+/**
+ * @brief Release all allocated resources
+ * @param doa doa_handle_t instance pointer to be freed
+ */
+void esp_doa_destroy(doa_handle_t *doa);
+
+/**
+ * @brief Process audio frame for direction estimation
+ * @param doa doa_handle_t instance pointer
+ * @param left Left channel 16-bit PCM data
+ * @param right Right channel 16-bit PCM data
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_DOA_H_ */
--- a/include/esp32c5/esp_mase.h
+++ b/include/esp32c5/esp_mase.h
@ -0,0 +1,93 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_MASE_H_
+#define _ESP_MASE_H_
+#include <stdint.h>  // int16_t is used by mase_process(); keep this header self-contained
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MASE_SAMPLE_RATE 16000        // Supports 16kHz only
+#define MASE_FRAME_SIZE 16            // Supports 16ms only
+#define MASE_MIC_DISTANCE 65          // According to physical design of mic-array
+
+/**
+ * @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array 
+ * are supported.
+ */
+typedef enum {
+    TWO_MIC_LINE = 0,
+    THREE_MIC_CIRCLE = 1
+} mase_mic_array_type_t;
+
+/**
+ * @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
+ */
+typedef enum {
+    NORMAL_ENHANCEMENT_MODE = 0,
+    WAKE_UP_ENHANCEMENT_MODE = 1
+} mase_op_mode_t;
+
+typedef void* mase_handle_t;
+
+/**
+ * @brief Creates an instance to the MASE structure.
+ *
+ * @param fs                The sampling frequency (Hz) must be 16000.
+ *
+ * @param frame_size        The length of the audio processing must be 16ms.
+ *
+ * @param array_type        '0' for 2-mic line array and '1' for 3-mic circular array.
+ *
+ * @param mic_distance      The distance between neighboring microphones in mm.
+ *
+ * @param operating_mode    '0' for normal mode and '1' for wake-up enhanced mode.
+ *
+ * @param filter_strength   Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
+ * 
+ * @return
+ *         - NULL: Create failed
+ *         - Others: An instance of MASE
+ */
+mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
+
+/**
+ * @brief Performs mic array processing for one frame.
+ *
+ * @param st          The instance of MASE.
+ *
+ * @param in          An array of 16-bit signed audio samples from mic.
+ *
+ * @param dsp_out     Returns enhanced signal.
+ *
+ * @return None
+ *
+ */
+void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
+
+/**
+ * @brief Free the MASE instance
+ *
+ * @param st The instance of MASE.
+ *
+ * @return None
+ *
+ */
+void mase_destory(mase_handle_t st);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/esp32c5/esp_mfcc_models.h
+++ b/include/esp32c5/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32c5/esp_mn_iface.h
+++ b/include/esp32c5/esp_mn_iface.h
@ -0,0 +1,223 @@
+#pragma once
+#include "stdint.h"
+#include "esp_wn_iface.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ESP_MN_RESULT_MAX_NUM 5
+#define ESP_MN_MAX_PHRASE_NUM 400
+#define ESP_MN_MAX_PHRASE_LEN 63
+#define ESP_MN_MIN_PHRASE_LEN 2
+
+#define ESP_MN_PREFIX "mn"
+#define ESP_MN_ENGLISH "en"
+#define ESP_MN_CHINESE "cn"
+
+typedef enum {
+    ESP_MN_STATE_DETECTING = 0,     // detecting
+    ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_TIMEOUT = 2,       // time out
+} esp_mn_state_t;
+
+//Set multinet loading mode
+//The memory consumption is decreased with increasing mode,
+//As a consequence also the CPU loading rate goes up
+typedef enum {
+    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
+    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and load the rest from FLASH (default)
+    ESP_MN_LOAD_FROM_FLASH = 2,          // Load more weights from FLASH. Minimum memory consumption with slowest computation
+} esp_mn_loader_mode_t;
+
+typedef enum {
+    ESP_MN_GREEDY_SEARCH = 0,          // greedy search
+    ESP_MN_BEAM_SEARCH = 1,            // beam search
+    ESP_MN_BEAM_SEARCH_WITH_FST = 2,  // beam search with trie language model
+} esp_mn_search_method_t;
+
+typedef enum {
+    CHINESE_ID = 1,       // Chinese language
+    ENGLISH_ID = 2,       // English language
+} language_id_t;
+
+// Return all possible recognition results
+typedef struct{
+    esp_mn_state_t state;
+    int num;                                   // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
+    int command_id[ESP_MN_RESULT_MAX_NUM];     // The list of command id.
+    int phrase_id[ESP_MN_RESULT_MAX_NUM];      // The list of phrase id.
+    float prob[ESP_MN_RESULT_MAX_NUM];         // The list of probability.
+    char string[256];
+} esp_mn_results_t;
+
+typedef struct {
+    char *string;                               // command string
+    char *phonemes;                             // command phonemes, if applicable
+    int16_t command_id;                         // the command id
+    float threshold;                            // trigger threshold, default: 0
+    int16_t *wave;                              // prompt wave data of the phrase
+} esp_mn_phrase_t;
+
+typedef struct _mn_node_ {
+    esp_mn_phrase_t *phrase;
+    struct _mn_node_ *next;
+} esp_mn_node_t;
+
+typedef struct{
+    int16_t num;                                // The number of error phrases, which can not added into model
+    esp_mn_phrase_t **phrases;                  // The array of error phrase pointer
+} esp_mn_error_t;
+
+/**
+ * @brief Initialize a model instance with specified model name.
+ *
+ * @param model_name  The multinet model name.
+ * @param duration    The duration (ms) to trigger the timeout
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
+
+/**
+ * @brief Switch multinet mode to change memory consumption and CPU loading
+ *
+ * @warning Just Support multinet6 or later versions
+ *
+ * @param model The model object to query
+ * @param mode  The multinet loader mode
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
+
+/**
+ * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model       The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Callback function type to fetch the number of frames recognized by the command word
+ *
+ * @param model       The model object to query
+ * @return The number of the frames recognized by the command word
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
+ */
+typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model       The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the language of model
+ *
+ * @param model       The model object to query
+ * @return Language name string defined in esp_mn_models.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
+ */
+typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
+ *
+ * @param model       The model object to query.
+ * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the
+ *                    get_samp_chunksize function.
+ * @return The state of multinet
+ */
+typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Destroy a speech commands recognition model
+ *
+ * @param model       The Model object to destroy
+ */
+typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get recognition results
+ *
+ * @param model       The Model object to query
+ *
+ * @return The current results.
+ */
+typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
+
+/**
+ * @brief Open the log print
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Clean all status of model
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Set the speech commands by mn_command_root
+ *
+ * @param model_data       The model object to query.
+ * @param mn_command_root  The speech commands link.
+ * @return The error phrase id info.
+ */
+typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
+
+
+/**
+ * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
+ *
+ * @param model_data     The model object to query
+*/
+typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
+
+/**
+ * @brief Check if input string can be tokenized
+ *
+ * @param model_data     The model object to query
+ * @param str            The input string
+*/
+typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
+
+typedef struct {
+    esp_mn_iface_op_create_t create;
+    esp_mn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
+    esp_mn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_mn_iface_op_get_language_t get_language;
+    esp_mn_iface_op_detect_t detect;
+    esp_mn_iface_op_destroy_t destroy;
+    esp_mn_iface_op_get_results_t get_results;
+    esp_mn_iface_op_open_log_t open_log;
+    esp_mn_iface_op_clean_t clean;
+    esp_wn_iface_op_set_speech_commands set_speech_commands;
+    esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
+    esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
+    esp_mn_iface_op_check_speech_command check_speech_command;
+} esp_mn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_mn_models.h
+++ b/include/esp32c5/esp_mn_models.h
@ -0,0 +1,66 @@
+#pragma once
+#include "esp_mn_iface.h"
+
+//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
+//a specific phrase or word.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief Get the multinet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of multinet
+ */
+esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
+
+/**
+ * @brief Get the multinet language from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The language of multinet
+ */
+char *esp_mn_language_from_name(char *model_name);
+
+/*
+ Configure wake word to use based on what's selected in menuconfig.
+*/
+
+#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
+#include "multinet2_ch.h"
+#define MULTINET_COEFF get_coeff_multinet2_ch
+#define MULTINET_MODEL_NAME "mn2_cn"
+
+#else
+#define MULTINET_COEFF      "COEFF_NULL"
+#define MULTINET_MODEL_NAME "NULL"
+#endif
+
+
+/* example
+
+static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
+
+//Initialize MultiNet model data
+model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
+add_speech_commands(multinet, model_data);
+
+//Set parameters of buffer
+int audio_chunksize=model->get_samp_chunksize(model_data);
+int frequency = model->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+int r=model->detect(model_data, buffer);
+if (r>0) {
+    printf("Detection triggered output %d.\n",  r);
+}
+
+//Destroy model
+model->destroy(model_data)
+
+*/
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_ns.h
+++ b/include/esp32c5/esp_ns.h
@ -0,0 +1,86 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_NS_H_
+#define _ESP_NS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NS_USE_SPIARM       0
+#define NS_FRAME_LENGTH_MS     10          //Supports 10ms, 20ms, 30ms
+
+/**
+* The Sampling frequency (Hz) must be 16000Hz
+*/
+
+typedef void* ns_handle_t;
+
+/**
+ * @brief Creates an instance to the NS structure.
+ *
+ * @param frame_length   The length of the audio processing can be 10ms, 20ms, 30ms.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_create(int frame_length);
+
+/**
+ * @brief Creates an instance of the more powerful noise suppression algorithm.
+ * 
+ * @warning frame_length only supports 10 ms.
+ *
+ * @param frame_length    The length of the audio processing can only be 10ms.
+ * @param mode            0: Mild, 1: Medium, 2: Aggressive
+ * @param sample_rate     The sample rate of the audio. 
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
+ *
+ * @param inst        The instance of NS.
+ *
+ * @param indata      An array of 16-bit signed audio samples.
+ *
+ * @param outdata     An array of 16-bit signed audio samples after noise suppression.
+ *
+ * @return None
+ *
+ */
+void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Free the NS instance
+ *
+ * @param inst The instance of NS.
+ *
+ * @return None
+ *
+ */
+void ns_destroy(ns_handle_t inst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_NS_H_
--- a/include/esp32c5/esp_nsn_iface.h
+++ b/include/esp32c5/esp_nsn_iface.h
@ -0,0 +1,64 @@
+#pragma once
+#include "stdint.h"
+
+//Opaque model data container
+typedef struct esp_nsn_data_t esp_nsn_data_t;
+
+
+/**
+ * @brief Easy function type to initialize a model instance
+ *
+ * @param model_name The name of the model instance
+ * @returns Handle to the model data
+ */
+typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the process function
+ *
+ * Every noise suppression model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the process function
+ */
+typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the noise suppression model and get data after process.
+ *
+ *
+ * @param model The model object to query
+ * @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the 
+ *        get_samp_chunksize function.
+ * @param out_data An array of 16-bit signed audio samples after process.
+ * @return The state of return.
+ */
+typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the process function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Destroy a noise suppression model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a noise suppression model.
+ */
+typedef struct {
+    esp_nsn_iface_op_create_t create;
+    esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_nsn_iface_op_process_t process;
+    esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_nsn_iface_op_destroy_t destroy;
+} esp_nsn_iface_t;
--- a/include/esp32c5/esp_nsn_models.h
+++ b/include/esp32c5/esp_nsn_models.h
@ -0,0 +1,17 @@
+#pragma once
+
+#include "esp_nsn_iface.h"
+
+/*
+The prefix of nsnet
+Now there are nsnet1 and nsnet2
+*/
+#define ESP_NSNET_PREFIX "nsnet"
+
+/**
+ * @brief Get the nsnet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of nsnet
+ */
+esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
--- a/include/esp32c5/esp_speech_features.h
+++ b/include/esp32c5/esp_speech_features.h
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

 float *esp_fftr(float *x, int nfft, void *fft_table);

-float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);

 void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

--- a/include/esp32c5/esp_sr_webrtc.h
+++ b/include/esp32c5/esp_sr_webrtc.h
@ -0,0 +1,84 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_WEBRTC_H_
+#define _ESP_WEBRTC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "esp_agc.h"
+#include "esp_log.h"
+#include "esp_ns.h"
+#include "sr_ringbuf.h"
+#include <stdint.h>
+#include <stdbool.h>  // bool is used by webrtc_process(); do not rely on transitive includes
+#include "esp_heap_caps.h"
+
+typedef struct {
+    void *ns_handle;
+    void *agc_handle;
+    int frame_size;
+    int sample_rate;
+    int16_t *buff;
+    int16_t *out_data;
+    sr_ringbuf_handle_t rb;
+} webrtc_handle_t;
+
+/**
+ * @brief Creates an instance of webrtc.
+ *
+ * @warning frame_length_ms supports 10 ms, 20 ms, 30 ms, 32 ms.
+ *
+ * @param frame_length_ms    The length of the audio processing
+ * @param ns_mode            The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
+ * @param agc_mode           The mode of AGC
+ * @param agc_gain           The gain of AGC. default is 9
+ * @param agc_target_level   The target level of AGC. default is -3 dbfs
+ * @param sample_rate        The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of webrtc
+ */
+webrtc_handle_t *webrtc_create(
+    int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
+ *
+ * @param handle        The instance of webrtc.
+ * @param indata        An array of 16-bit signed audio samples.
+ * @param size          The sample size of output data
+ * @param enable_ns     Enable noise suppression
+ * @param enable_agc    Enable automatic gain control
+ *
+ * @return data after noise suppression
+ */
+int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
+
+/**
+ * @brief Free the webrtc instance
+ *
+ * @param handle The instance of webrtc.
+ *
+ * @return None
+ *
+ */
+void webrtc_destroy(webrtc_handle_t *handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_WEBRTC_H_
--- a/include/esp32c5/esp_vad.h
+++ b/include/esp32c5/esp_vad.h
@ -0,0 +1,178 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_VAD_H_
+#define _ESP_VAD_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
+
+/**
+ * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+ * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
+ */
+typedef enum {
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
+} vad_mode_t;
+
+typedef enum {
+    VAD_SILENCE = 0,
+    VAD_SPEECH = 1,
+} vad_state_t;
+
+typedef struct vad_trigger_tag {
+    vad_state_t state;
+    unsigned int min_speech_len;
+    unsigned int noise_len;
+    unsigned int min_noise_len;
+    unsigned int speech_len;
+} vad_trigger_t;
+
+#define vad_MAX_LEN (INT32_MAX - 1) // parenthesized so the macro expands safely inside larger expressions
+/**
+ * @brief Allocate VAD trigger
+ *
+ * @param min_speech_len  Minimum frame number of speech duration
+ * @param min_noise_len   Minimum frame number of noise duration
+ *
+ * @return Trigger pointer
+ **/
+vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
+
+/**
+ * @brief Free VAD trigger
+ **/
+void vad_trigger_free(vad_trigger_t *trigger);
+
+/**
+ * @brief Reset VAD trigger
+ **/
+void vad_trigger_reset(vad_trigger_t *trigger);
+
+/**
+ * @brief Detect voice activity by trigger
+ **/
+vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
+
+typedef struct {
+    vad_trigger_t *trigger;
+    void *vad_inst;
+    int sample_rate;
+    int frame_size;
+} vad_handle_with_trigger_t;
+
+typedef vad_handle_with_trigger_t *vad_handle_t;
+
+// typedef vad_handle_tag * vad_handle_t;
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create(vad_mode_t vad_mode);
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ * @param sample_rate       Sample rate in Hz
+ * @param one_frame_ms      Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
+ * @param min_speech_ms     Minimum speech duration, unit is ms
+ * @param min_noise_ms      Minimum noise duration, unit is ms
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @param sample_rate_hz    The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
+ * @param one_frame_ms      The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
+
+/**
+ * @brief Reset trigger state as Silence
+ *
+ * @param handle            The instance of VAD.
+ */
+void vad_reset_trigger(vad_handle_t handle);
+
+/**
+ * @brief Free the VAD instance
+ *
+ * @param inst The instance of VAD.
+ *
+ * @return None
+ *
+ */
+void vad_destroy(vad_handle_t inst);
+
+/*
+ * Programming Guide:
+ *
+ * @code{c}
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);     // Creates an instance of
+ * the VAD structure.
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_VAD_H_
--- a/include/esp32c5/esp_vadn_iface.h
+++ b/include/esp32c5/esp_vadn_iface.h
@ -0,0 +1,164 @@
+#pragma once
+#include "esp_vad.h"
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+// /**
+//  * @brief The state of vad
+//  */
+// typedef enum {
+//     VAD_NOISE = -1,  // Noise
+//     VADNET_STATE_SILENCE = 0, // Silence
+//     VAD_SPEECH = 1   // Speech
+// } vad_state_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode
+ * and specified model name
+ *
+ * @param model_name  The specified model name
+ * @param mode        The voice activity detection mode
+ * @param channel_num The number of input audio channels
+ * @param min_speech_ms  The minimum duration of speech in ms to trigger vad
+ * speech
+ * @param min_noise_ms   The minimum duration of noise in ms to trigger vad
+ * noise
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
+    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect
+ * function
+ *
+ * Every speech recognition model processes a certain number of samples at the
+ * same time. This function can be used to query that amount. Note that the
+ * returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of audio channels that need to be passed to the detect
+ * function
+ *
+ * This is a channel count, not a number of samples or bytes. It presumably
+ * reflects the channel_num passed to the create function (TODO confirm against
+ * the implementation).
+ *
+ * @param model The model object to query
+ * @return The number of input audio channels
+ */
+typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger speech detection, the range of
+ * det_threshold is 0.5~0.9999
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the voice activity detection threshold
+ *
+ * @param model The model object to query
+ * @returns the detection threshold
+ */
+typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used
+ * can be queried by the get_samp_chunksize function.
+ * @return The VAD state: VAD_SILENCE if no voice is detected, VAD_SPEECH if
+ * voice is detected.
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param cq An array of 16-bit MFCC.
+ * @return The VAD state: VAD_SILENCE if no voice is detected, VAD_SPEECH if
+ * voice is detected.
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a model object
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * This structure contains the functions used to do operations on a voice
+ * activity detection model.
+ */
+typedef struct {
+    esp_vadn_iface_op_create_t create;
+    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_vadn_iface_op_get_channel_num_t get_channel_num;
+    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
+    esp_vadn_iface_op_detect_t detect;
+    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_vadn_iface_op_clean_t clean;
+    esp_vadn_iface_op_destroy_t destroy;
+} esp_vadn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_vadn_models.h
+++ b/include/esp32c5/esp_vadn_models.h
@ -0,0 +1,22 @@
+#pragma once
+#include "esp_vadn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of vadnet model name is used to filter all vadnet models from available models.
+#define ESP_VADN_PREFIX "vadnet"
+
+/**
+ * @brief Get the vadnet handle from model name
+ *
+ * @param model_name   The name of model
+ * @returns The handle of vadnet
+ */
+const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c5/esp_wn_iface.h
+++ b/include/esp32c5/esp_wn_iface.h
@ -29,6 +29,7 @@ typedef enum {
    DET_MODE_2CH_95 = 3,
    DET_MODE_3CH_90 = 4,
    DET_MODE_3CH_95 = 5,
+	DET_MODE_90_COPY_PARAMS = 6,       // Aggressive
 } det_mode_t;

 typedef struct {
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
 * @brief Set the detection threshold to manually abjust the probability 
 *
 * @param model The model object to query
- * @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
 * @param word_index The index of wake word
 * @return 0: setting failed, 1: setting success
 */
 typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);

+/**
+ * @brief Reset the threshold to its initial state  
+ *
+ * @param model The model object to query
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
+
+
 /**
 * @brief Get the wake word detection threshold of different modes
 *
@ -200,6 +210,7 @@ typedef struct {
    esp_wn_iface_op_get_word_num_t get_word_num;
    esp_wn_iface_op_get_word_name_t get_word_name;
    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
--- a/include/esp32c5/esp_wn_models.h
+++ b/include/esp32c5/esp_wn_models.h
@ -11,7 +11,7 @@ extern "C" {
 /**
 * @brief Get the wakenet handle from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The handle of wakenet
 */
 const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
 /**
 * @brief Get the wake word name from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
 */
-char* esp_wn_wakeword_from_name(const char *model_name);
+char *esp_wn_wakeword_from_name(const char *model_name);

 #ifdef __cplusplus
 }
--- a/include/esp32c5/flite_g2p.h
+++ b/include/esp32c5/flite_g2p.h
@ -0,0 +1,20 @@
+#ifndef __FLITE_G2P_H__
+#define __FLITE_G2P_H__
+
+typedef struct {
+    int num_phonemes;
+    int phoneme_size;
+    char **phonemes;
+} flite_g2p_result;
+
+void flite_g2p_result_free(flite_g2p_result *result);
+
+flite_g2p_result *flite_g2p_get_result(const char *grapheme);
+
+void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p(const char *graphemes, int map_phonemes);
+
+#endif
--- a/include/esp32c6/esp_afe_aec.h
+++ b/include/esp32c6/esp_afe_aec.h
@ -2,9 +2,8 @@
 #ifndef _ESP_AFE_AEC_H_
 #define _ESP_AFE_AEC_H_

-
-#include "esp_afe_config.h"
 #include "esp_aec.h"
+#include "esp_afe_config.h"

 #include <stdint.h>

@ -13,19 +12,19 @@ extern "C" {
 #endif

 typedef struct {
-    aec_handle_t* handle;
+    aec_handle_t *handle;
    aec_mode_t mode;
    afe_pcm_config_t pcm_config;
    int frame_size;
-    int16_t  *data;
-}afe_aec_handle_t;
-
+    int16_t *data;
+} afe_aec_handle_t;

 /**
- * @brief Creates an instance to the AEC structure. 
- * 
- * @warning Currently only support 1 microphone channel and 1 playback channe. 
- * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only supports 1 microphone channel and 1 playback channel.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback
+ * channel will be selected.
 *
 * The input format, same as afe config:
 * M to represent the microphone channel
@ -37,7 +36,8 @@ typedef struct {
 *
 * @param input_format     The input format
 * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
- *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
+ * esp32c5.
 * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
 * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
@ -45,17 +45,17 @@ typedef struct {
 */
 afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);

-
 /**
 * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
- * 
+ *
 * @param inst        The instance of AEC.
- * @param indata      Input audio data, format is define by input_format. Note indata will be modified in function call.
- * @param outdata     Returns near-end signal with echo removed. 
+ * @param indata      Input audio data, format is defined by input_format.
+ * @param outdata     Near-end signal with echo removed. outdata must be 16-byte aligned.
+ *                    Please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory.

 * @return The bytes of outdata.
 */
-size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);

 /**
 * @brief Get frame size of AEC (the samples of one frame)
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
 */
 int afe_aec_get_chunksize(afe_aec_handle_t *handle);

-
 /**
 * @brief Free the AEC instance
 *
--- a/include/esp32c6/esp_afe_config.h
+++ b/include/esp32c6/esp_afe_config.h
@ -1,9 +1,15 @@
 #pragma once
 #include "esp_aec.h"
+#include "esp_agc.h"
+#include "esp_nsn_models.h"
+#include "esp_vad.h"
+#include "esp_vadn_models.h"
+#include "esp_wn_iface.h"
+#include "esp_wn_models.h"
+#include "model_path.h"
 #include "stdbool.h"
 #include "stdint.h"
 #include "stdlib.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -27,7 +33,8 @@ typedef enum {
 // Set AFE type
 typedef enum {
    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
-    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
+    AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
 } afe_type_t;

 typedef enum {
@ -62,8 +69,220 @@ typedef enum {
    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
 } afe_agc_mode_t;

+/**
+ * @brief Function to get the debug audio data
+ *
+ * @param data        The debug audio data, which must not be modified. It should be copied away as soon as possible to
+ * avoid blocking for too long.
+ * @param data_size   The number of bytes of data.
+ * @returns
+ */
+typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
+
+typedef enum {
+    AFE_DEBUG_HOOK_MASE_TASK_IN = 0,  // To get the input data of mase task
+    AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
+    AFE_DEBUG_HOOK_MAX = 2
+} afe_debug_hook_type_t;
+
+typedef struct {
+    afe_debug_hook_type_t hook_type;         // debug type of hook
+    afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
+} afe_debug_hook_t;
+
+typedef struct {
+    /********** AEC(Acoustic Echo Cancellation) **********/
+    bool aec_init;         // Whether to init aec
+    aec_mode_t aec_mode;   // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
+    int aec_filter_length; // The filter length of aec
+
+    /********** SE(Speech Enhancement, microphone array processing) **********/
+    bool se_init; // Whether to init se
+
+    /********** NS(Noise Suppression) **********/
+    bool ns_init;              // Whether to init ns
+    char *ns_model_name;       // Model name of ns
+    afe_ns_mode_t afe_ns_mode; // Model mode of ns
+
+    /********** VAD(Voice Activity Detection) **********/
+    bool vad_init;          // Whether to init vad
+    vad_mode_t vad_mode;    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
+    char *vad_model_name;   // The model name of vad, If it is null, WebRTC VAD will be used.
+    int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
+    int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
+                            // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
+    bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
+    bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
+
+    /********** WakeNet(Wake Word Engine) **********/
+    bool wakenet_init;
+    char *wakenet_model_name;   // The model name of wakenet 1
+    char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
+    det_mode_t wakenet_mode;    // The mode of wakenet
+
+    /********** AGC(Automatic Gain Control) **********/
+    bool agc_init; // Whether to init agc
+    afe_agc_mode_t
+        agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
+    int agc_compression_gain_db; // Compression gain in dB (default 9)
+    int agc_target_level_dbfs;   // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
+
+    /********** General AFE(Audio Front End) parameter **********/
+    afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
+    afe_mode_t afe_mode;         // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+    afe_type_t afe_type;         // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
+    int afe_perferred_core;      // The preferred core of afe se task, which is created in afe_create function.
+    int afe_perferred_priority;  // The preferred priority of afe se task, which is created in afe_create function.
+    int afe_ringbuf_size;        // The ring buffer size: the number of frame data in ring buffer.
+    afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
+    float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
+                           // directly on the output amplitude: out_linear_gain * amplitude.
+    bool debug_init;
+    bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
+                              // otherwise, select channel number by wakenet
+} afe_config_t;
+
+/**
+ * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
+ * on the chip target and input format. You can manually fine-tune it after creating the configuration
+ *
+ * The input format:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param models           Models from partition, which is configured by Kconfig
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_config_t*  The default config of afe
+ */
+afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
+
+/**
+ * @brief Check AFE configuration and make sure it is correct.
+ *
+ * @warning If there is a configuration conflict, this function will modify some parameters.
+ * The guiding principle behind these modifications is to maintain the highest performance of the output audio and
+ * results, and to remove conflicts between different algorithms.
+ *
+ * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
+ * If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
+ *
+ * @param afe_config       Input AFE config
+ *
+ * @return afe_config_t*  The modified AFE config
+ */
+afe_config_t *afe_config_check(afe_config_t *afe_config);
+
+/**
+ * @brief Parse input format
+ *
+ * @param input_format The input format, same with afe_config_init() function
+ * @param pcm_config   The pcm config
+ *
+ * @return true if the input format is parsed successfully, otherwise false
+ */
+bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse I2S input data
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param mic_data     The output microphone data
+ * @param ref_data     The output playback reference data
+ * @param pcm_config   The pcm config
+ *
+ */
+void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse input data, from interleaved arrangement to contiguous arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Format input data, from contiguous arrangement to interleaved arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Adjust the gain of input data
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param data         The input audio data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param factor       The gain factor
+ *
+ * @return int16_t*    The output audio data
+ */
+int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
+
+/**
+ * @brief Concatenate input data into the output data (TODO confirm: the previous brief duplicated afe_adjust_gain)
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param in_data         The input audio data
+ * @param in_frame_size   The frame size of input data
+ * @param channel_num     The channel number of input data, which is same as output data
+ * @param out_data        The output audio data
+ * @param out_frame_size  The frame size of output data
+ *
+ */
+void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
+
+/**
+ * @brief Copy the afe config
+ *
+ * @param dst_config    The destination afe config
+ * @param src_config    The source afe config
+ *
+ * @return   The destination afe config
+ */
+afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
+
+/**
+ * @brief Print the afe config
+ *
+ * @param afe_config    The afe config
+ */
+void afe_config_print(const afe_config_t *afe_config);
+
+/**
+ * @brief Allocate afe config (takes no arguments)
+ *
+ * @return The afe config pointer
+ */
+afe_config_t *afe_config_alloc(void);
+
+/**
+ * @brief Free afe config
+ *
+ * @param afe_config  The afe config pointer
+ */
+void afe_config_free(afe_config_t *afe_config);

 #ifdef __cplusplus
 }
 #endif
-
--- a/include/esp32c6/esp_afe_doa.h
+++ b/include/esp32c6/esp_afe_doa.h
@ -0,0 +1,48 @@
+#ifndef _ESP_AFE_DOA_H_
+#define _ESP_AFE_DOA_H_
+
+#include "esp_doa.h"
+#include "esp_afe_config.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    doa_handle_t *doa_handle;
+    afe_pcm_config_t pcm_config;
+    int16_t *leftdata;
+    int16_t *rightdata;
+    int frame_size;
+} afe_doa_handle_t;
+
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param input_format     The input format
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized afe_doa_handle_t object pointer. Using the example values above is recommended for better performance
+ */
+afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
+/**
+ * @brief Process audio frame for direction estimation
+ * @param handle afe_doa_handle_t instance pointer
+ * @param indata Input audio data, format is defined by input_format.
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
+/**
+ * @brief Release all allocated resources
+ * @param handle afe_doa_handle_t instance pointer to be freed
+ */
+void afe_doa_destroy(afe_doa_handle_t *handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_AFE_DOA_H_ */
--- a/include/esp32c6/esp_afe_sr_iface.h
+++ b/include/esp32c6/esp_afe_sr_iface.h
@ -0,0 +1,237 @@
+#pragma once
+#include "esp_afe_config.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// afe_sr/AFE_SR: the audio front-end for speech recognition
+
+// Opaque AFE_SR data container
+typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
+
+/**
+ * @brief The state of vad
+ */
+typedef enum {
+    AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
+    AFE_VAD_SPEECH = 1   // Deprecated, please use vad_state_t, speech
+} afe_vad_state_t;
+
+/**
+ * @brief The result of fetch function
+ */
+typedef struct afe_fetch_result_t {
+    int16_t *data;      // the target channel data of audio.
+    int data_size;      // the size of data. The unit is byte.
+    int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
+                        // audio that was truncated.
+    int vad_cache_size; // the size of vad_cache. The unit is byte.
+    float data_volume;  // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
+                        // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
+                        // wakenet(about 1.5s), otherwise is the frame length.
+    wakenet_state_t wakeup_state; // the value is wakenet_state_t
+    int wake_word_index;          // if the wake word is detected. It will store the wake word index which start from 1.
+    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
+                             // start from 1.
+    vad_state_t vad_state;   // the value is vad_state_t
+    int trigger_channel_id;  // the channel index of output
+    int wake_word_length;    // the length of wake word. The unit is the number of samples.
+    int ret_value;           // the return state of fetch function
+    int16_t *raw_data;       // the multi-channel output data of audio.
+    int raw_data_channels;   // the channel number of raw data
+    float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
+    void *reserved;          // reserved for future use
+} afe_fetch_result_t;
+
+/**
+ * @brief Function to initialize an AFE_SR instance
+ *
+ * @param afe_config        The config of AFE_SR
+ * @returns Handle to the AFE_SR data
+ */
+typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
+
+/**
+ * @brief Get the amount of each channel samples per frame that need to be passed to the function
+ *
+ * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param afe The AFE_SR object to query
+ * @return The amount of samples to feed the fetch function
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the channel number
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The amount of total channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the function
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The sample rate, in hz
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Feed samples of an audio stream to the AFE_SR
+ *
+ * @Warning  The input data should be arranged in the format of channel interleaving.
+ *           The last channel is reference signal if it has reference data.
+ *
+ * @param afe   The AFE_SR object to query
+ *
+ * @param in    The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
+ *              `get_feed_chunksize`.
+ * @return      The size of input
+ */
+typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR
+ *
+ * @Warning  The output is single channel data, no matter how many channels the input is.
+ *           Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
+ *
+ * @Warning  The output is single channel data, no matter how many channels the input is.
+ *
+ * @param afe            The AFE_SR object to query
+ * @param ticks_to_wait  The timeout value, in ticks, to wait for the fetch result.
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
+
+/**
+ * @brief reset ringbuf of AFE.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Set wakenet detection threshold 
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @param threshold     The wakenet detection threshold, the value is between 0.4 and 0.9999.
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
+
+/**
+ * @brief Reset wakenet detection threshold to initial state
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
+
+/**
+ * @brief Reset one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Disable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Print all functions/modules/algorithms pipeline.
+ *       The pipeline is the order of the functions/modules/algorithms.
+ *       The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
+ *
+ * @param afe          The AFE_SR object to query
+ */
+typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Destroy a AFE_SR instance
+ *
+ * @param afe         AFE_SR object to destroy
+ */
+typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * This structure contains the functions used to do operations on a AFE_SR.
+ */
+typedef struct {
+    esp_afe_sr_iface_op_create_from_config_t create_from_config;
+    esp_afe_sr_iface_op_feed_t feed;
+    esp_afe_sr_iface_op_fetch_t fetch;
+    esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
+    esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
+    esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
+    esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
+    esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
+    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
+    esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
+    esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
+    esp_afe_sr_iface_op_disable_func_t disable_wakenet;
+    esp_afe_sr_iface_op_enable_func_t enable_wakenet;
+    esp_afe_sr_iface_op_disable_func_t disable_aec;
+    esp_afe_sr_iface_op_enable_func_t enable_aec;
+    esp_afe_sr_iface_op_disable_func_t disable_se;
+    esp_afe_sr_iface_op_enable_func_t enable_se;
+    esp_afe_sr_iface_op_disable_func_t disable_vad;
+    esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
+    esp_afe_sr_iface_op_disable_func_t disable_ns;
+    esp_afe_sr_iface_op_enable_func_t enable_ns;
+    esp_afe_sr_iface_op_disable_func_t disable_agc;
+    esp_afe_sr_iface_op_enable_func_t enable_agc;
+    esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
+    esp_afe_sr_iface_op_destroy_t destroy;
+} esp_afe_sr_iface_t;
+
+// struct is used to store the AFE handle and data for the AFE task
+typedef struct {
+    esp_afe_sr_data_t *afe_data;
+    esp_afe_sr_iface_t *afe_handle;
+    TaskHandle_t feed_task;
+    TaskHandle_t fetch_task;
+} afe_task_into_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_afe_sr_models.h
+++ b/include/esp32c6/esp_afe_sr_models.h
@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "esp_afe_sr_iface.h"
+
+esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_agc.h
+++ b/include/esp32c6/esp_agc.h
@ -0,0 +1,47 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AGC_H_
+#define _ESP_AGC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//// all non-negative values are valid, negative values are errors
+typedef enum {
+    ESP_AGC_SUCCESS = 0,   ////success
+    ESP_AGC_FAIL = -1, ////agc fail
+    ESP_AGC_SAMPLE_RATE_ERROR = -2,  ///sample rate can be only 8khz, 16khz, 32khz
+    ESP_AGC_FRAME_SIZE_ERROR = -3,   ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
+} ESP_AGE_ERR;
+
+typedef enum {
+    AGC_MODE_SR = -1,      // Bypass WEBRTC AGC
+    AGC_MODE_0 = 0,        // Only saturation protection
+    AGC_MODE_1 = 1,        // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_2 = 2,        // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_3 = 3,        // Fixed Digital Gain [compressionGaindB (default 8 dB)]
+} agc_mode_t;
+
+void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
+void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
+int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
+void esp_agc_close(void *agc_handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ESP_AGC_H_
--- a/include/esp32c6/esp_doa.h
+++ b/include/esp32c6/esp_doa.h
@ -0,0 +1,41 @@
+#ifndef _ESP_DOA_H_
+#define _ESP_DOA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct doa_handle_t doa_handle_t;
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
+ */
+doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
+
+/**
+ * @brief Release all allocated resources
+ * @param doa doa_handle_t instance pointer to be freed
+ */
+void esp_doa_destroy(doa_handle_t *doa);
+
+/**
+ * @brief Process audio frame for direction estimation
+ * @param doa doa_handle_t instance pointer
+ * @param left Left channel 16-bit PCM data
+ * @param right Right channel 16-bit PCM data
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_DOA_H_ */
--- a/include/esp32c6/esp_mase.h
+++ b/include/esp32c6/esp_mase.h
@ -0,0 +1,95 @
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_MASE_H_
+#define _ESP_MASE_H_
+
+#include <stdint.h>  // int16_t is used by mase_process() below; keeps this header self-contained
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MASE_SAMPLE_RATE 16000        // Supports 16kHz only
+#define MASE_FRAME_SIZE 16            // Supports 16ms only
+#define MASE_MIC_DISTANCE 65          // According to physical design of mic-array
+
+/**
+ * @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array 
+ * are supported.
+ */
+typedef enum {
+    TWO_MIC_LINE = 0,
+    THREE_MIC_CIRCLE = 1
+} mase_mic_array_type_t;
+
+/**
+ * @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
+ */
+typedef enum {
+    NORMAL_ENHANCEMENT_MODE = 0,
+    WAKE_UP_ENHANCEMENT_MODE = 1
+} mase_op_mode_t;
+
+typedef void* mase_handle_t;
+
+/**
+ * @brief Creates an instance to the MASE structure.
+ *
+ * @param fs                The sampling frequency (Hz) must be 16000 (MASE_SAMPLE_RATE).
+ *
+ * @param frame_size        The length of the audio processing must be 16ms (MASE_FRAME_SIZE).
+ *
+ * @param array_type        '0' for 2-mic line array and '1' for 3-mic circular array (see mase_mic_array_type_t).
+ *
+ * @param mic_distance      The distance between neighboring microphones in mm.
+ *
+ * @param operating_mode    '0' for normal mode and '1' for wake-up enhanced mode (see mase_op_mode_t).
+ *
+ * @param filter_strength   Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
+ * 
+ * @return
+ *         - NULL: Create failed
+ *         - Others: An instance of MASE
+ */
+mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
+
+/**
+ * @brief Performs mic array processing for one frame.
+ *
+ * @param st          The instance of MASE.
+ *
+ * @param in          An array of 16-bit signed audio samples from mic.
+ *
+ * @param dsp_out     Returns enhanced signal.
+ *
+ * @return None
+ *
+ */
+void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
+
+/**
+ * @brief Free the MASE instance (note: the function name is spelled "destory" in this API)
+ *
+ * @param st The instance of MASE.
+ *
+ * @return None
+ *
+ */
+void mase_destory(mase_handle_t st);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/esp32c6/esp_mfcc_models.h
+++ b/include/esp32c6/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32c6/esp_mn_iface.h
+++ b/include/esp32c6/esp_mn_iface.h
@ -0,0 +1,223 @@
+#pragma once
+#include "stdint.h"
+#include "esp_wn_iface.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ESP_MN_RESULT_MAX_NUM 5
+#define ESP_MN_MAX_PHRASE_NUM 400
+#define ESP_MN_MAX_PHRASE_LEN 63
+#define ESP_MN_MIN_PHRASE_LEN 2
+
+#define ESP_MN_PREFIX "mn"
+#define ESP_MN_ENGLISH "en"
+#define ESP_MN_CHINESE "cn"
+
+typedef enum {
+    ESP_MN_STATE_DETECTING = 0,     // detecting
+    ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_TIMEOUT = 2,       // time out
+} esp_mn_state_t;
+
+//Multinet loading mode
+//Memory consumption decreases as the mode value increases;
+//as a consequence, the CPU load goes up
+typedef enum {
+    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with maximum memory consumption
+    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and load the rest from FLASH (default)
+    ESP_MN_LOAD_FROM_FLASH = 2,          // Load more weights from FLASH. Minimum memory consumption with slowest computation
+} esp_mn_loader_mode_t;
+
+typedef enum {
+    ESP_MN_GREEDY_SEARCH = 0,          // greedy search
+    ESP_MN_BEAM_SEARCH = 1,            // beam search
+    ESP_MN_BEAM_SEARCH_WITH_FST = 2,  // beam search with trie language model
+} esp_mn_search_method_t;
+
+typedef enum {
+    CHINESE_ID = 1,       // Chinese language
+    ENGLISH_ID = 2,       // English language
+} language_id_t;
+
+// Return all possible recognition results
+typedef struct{
+    esp_mn_state_t state;
+    int num;                                   // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
+    int command_id[ESP_MN_RESULT_MAX_NUM];     // The list of command id.
+    int phrase_id[ESP_MN_RESULT_MAX_NUM];      // The list of phrase id.
+    float prob[ESP_MN_RESULT_MAX_NUM];         // The list of probability.
+    char string[256];
+} esp_mn_results_t;
+
+typedef struct {
+    char *string;                               // command string
+    char *phonemes;                             // command phonemes, if applicable
+    int16_t command_id;                         // the command id
+    float threshold;                            // trigger threshold, default: 0
+    int16_t *wave;                              // prompt wave data of the phrase
+} esp_mn_phrase_t;
+
+typedef struct _mn_node_ {
+    esp_mn_phrase_t *phrase;
+    struct _mn_node_ *next;
+} esp_mn_node_t;
+
+typedef struct{
+    int16_t num;                                // The number of error phrases, i.e. phrases that could not be added into the model
+    esp_mn_phrase_t **phrases;                  // The array of error phrase pointers
+} esp_mn_error_t;
+
+/**
+ * @brief Initialize a model instance with specified model name.
+ *
+ * @param model_name  The multinet model name.
+ * @param duration    The duration (ms) to trigger the timeout
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
+
+/**
+ * @brief Switch multinet mode to change memory consumption and CPU loading
+ *
+ * @warning Just Support multinet6 or later versions
+ *
+ * @param model The model object to query
+ * @param mode  The multinet loader mode
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
+
+/**
+ * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model       The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Callback function type to fetch the number of frames recognized by the command word
+ *
+ * @param model       The model object to query
+ * @return The number of the frames recognized by the command word
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
+ */
+typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model       The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the language of model
+ *
+ * @param model       The model object to query
+ * @return Language name string defined in this header, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
+ */
+typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
+ *
+ * @param model       The model object to query.
+ * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the
+ *                    get_samp_chunksize function.
+ * @return The state of multinet
+ */
+typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Destroy a speech commands recognition model
+ *
+ * @param model       The Model object to destroy
+ */
+typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get recognition results
+ *
+ * @param model       The Model object to query
+ *
+ * @return The current results.
+ */
+typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
+
+/**
+ * @brief Open the log print
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Clean all status of model
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Set the speech commands by mn_command_root
+ *
+ * @param model_data       The model object to query.
+ * @param mn_command_root  The speech commands link.
+ * @return The error phrase id info.
+ */
+typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
+
+
+/**
+ * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
+ *
+ * @param model_data     The model object to query
+*/
+typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
+
+/**
+ * @brief Check if input string can be tokenized
+ *
+ * @param model_data     The model object to query
+ * @param str            The input string
+*/
+typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
+
+typedef struct {
+    esp_mn_iface_op_create_t create;
+    esp_mn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
+    esp_mn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_mn_iface_op_get_language_t get_language;
+    esp_mn_iface_op_detect_t detect;
+    esp_mn_iface_op_destroy_t destroy;
+    esp_mn_iface_op_get_results_t get_results;
+    esp_mn_iface_op_open_log_t open_log;
+    esp_mn_iface_op_clean_t clean;
+    esp_wn_iface_op_set_speech_commands set_speech_commands;
+    esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
+    esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
+    esp_mn_iface_op_check_speech_command check_speech_command;
+} esp_mn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_mn_models.h
+++ b/include/esp32c6/esp_mn_models.h
@ -0,0 +1,66 @@
+#pragma once
+#include "esp_mn_iface.h"
+
+//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
+//a specific phrase or word.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief Get the multinet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of multinet
+ */
+esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
+
+/**
+ * @brief Get the multinet language from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The language of multinet
+ */
+char *esp_mn_language_from_name(char *model_name);
+
+/*
+ Configure the MultiNet model to use based on what's selected in menuconfig.
+*/
+
+#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
+#include "multinet2_ch.h"
+#define MULTINET_COEFF get_coeff_multinet2_ch
+#define MULTINET_MODEL_NAME "mn2_cn"
+
+#else
+#define MULTINET_COEFF      "COEFF_NULL"
+#define MULTINET_MODEL_NAME "NULL"
+#endif
+
+
+/* example
+
+esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
+
+//Initialize MultiNet model data (second argument: timeout duration in ms, e.g. 6000)
+model_iface_data_t *model_data = multinet->create(model_name, 6000);
+add_speech_commands(multinet, model_data);
+
+//Set parameters of buffer
+int audio_chunksize=multinet->get_samp_chunksize(model_data);
+int frequency = multinet->get_samp_rate(model_data);
+int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
+
+//Detect
+esp_mn_state_t r=multinet->detect(model_data, buffer);
+if (r==ESP_MN_STATE_DETECTED) {
+    printf("Detection triggered output %d.\n",  multinet->get_results(model_data)->command_id[0]);
+}
+
+//Destroy model
+multinet->destroy(model_data);
+
+*/
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_ns.h
+++ b/include/esp32c6/esp_ns.h
@ -0,0 +1,86 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_NS_H_
+#define _ESP_NS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NS_USE_SPIARM       0
+#define NS_FRAME_LENGTH_MS     10          //Supports 10ms, 20ms, 30ms
+
+/**
+* The Sampling frequency (Hz) must be 16000Hz
+*/
+
+typedef void* ns_handle_t;
+
+/**
+ * @brief Creates an instance to the NS structure.
+ *
+ * @param frame_length   The length of the audio processing can be 10ms, 20ms, 30ms.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_create(int frame_length);
+
+/**
+ * @brief Creates an instance of the more powerful noise suppression algorithm.
+ * 
+ * @warning frame_length only supports 10 ms.
+ *
+ * @param frame_length    The length of the audio processing can only be 10ms.
+ * @param mode            0: Mild, 1: Medium, 2: Aggressive
+ * @param sample_rate     The sample rate of the audio. 
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
+ *
+ * @param inst        The instance of NS.
+ *
+ * @param indata      An array of 16-bit signed audio samples.
+ *
+ * @param outdata     An array of 16-bit signed audio samples after noise suppression.
+ *
+ * @return None
+ *
+ */
+void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Free the NS instance
+ *
+ * @param inst The instance of NS.
+ *
+ * @return None
+ *
+ */
+void ns_destroy(ns_handle_t inst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_NS_H_
--- a/include/esp32c6/esp_nsn_iface.h
+++ b/include/esp32c6/esp_nsn_iface.h
@ -0,0 +1,64 @@
+#pragma once
+#include "stdint.h"
+
+//Opaque model data container
+typedef struct esp_nsn_data_t esp_nsn_data_t;
+
+
+/**
+ * @brief Easy function type to initialize a model instance
+ *
+ * @param model_name The name of the model instance
+ * @returns Handle to the model data
+ */
+typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the process function
+ *
+ * Every noise suppression model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the process function
+ */
+typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the noise suppression model and get data after process.
+ *
+ *
+ * @param model The model object to query
+ * @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the 
+ *        get_samp_chunksize function.
+ * @param out_data An array of 16-bit signed audio samples after process.
+ * @return The state of return.
+ */
+typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the process function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Destroy a noise suppression model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a noise suppression model.
+ */
+typedef struct {
+    esp_nsn_iface_op_create_t create;
+    esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_nsn_iface_op_process_t process;
+    esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_nsn_iface_op_destroy_t destroy;
+} esp_nsn_iface_t;
--- a/include/esp32c6/esp_nsn_models.h
+++ b/include/esp32c6/esp_nsn_models.h
@ -0,0 +1,17 @@
+#pragma once
+
+#include "esp_nsn_iface.h"
+
+/*
+The prefix of nset
+Now there are nsnet1 and nsnet2
+*/
+#define ESP_NSNET_PREFIX "nsnet"
+
+/**
+ * @brief Get the nsnet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of multinet
+ */
+esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
--- a/include/esp32c6/esp_speech_features.h
+++ b/include/esp32c6/esp_speech_features.h
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

 float *esp_fftr(float *x, int nfft, void *fft_table);

-float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);

 void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

--- a/include/esp32c6/esp_sr_webrtc.h
+++ b/include/esp32c6/esp_sr_webrtc.h
@ -0,0 +1,84 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_WEBRTC_H_
+#define _ESP_WEBRTC_H_
+
+#include <stdbool.h>  // bool is used by webrtc_process(); standard headers stay outside extern "C"
+#include <stdint.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "esp_agc.h"
+#include "esp_log.h"
+#include "esp_ns.h"
+#include "sr_ringbuf.h"
+#include "esp_heap_caps.h"
+
+typedef struct {
+    void *ns_handle;
+    void *agc_handle;
+    int frame_size;
+    int sample_rate;
+    int16_t *buff;
+    int16_t *out_data;
+    sr_ringbuf_handle_t rb;
+} webrtc_handle_t;
+
+/**
+ * @brief Creates an instance of webrtc.
+ *
+ * @warning frame_length_ms supports 10 ms, 20 ms, 30 ms and 32 ms.
+ *
+ * @param frame_length_ms    The length of the audio processing
+ * @param ns_mode            The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
+ * @param agc_mode           The mode of AGC (see agc_mode_t)
+ * @param agc_gain           The gain of AGC. default is 9
+ * @param agc_target_level   The target level of AGC. default is -3 dbfs
+ * @param sample_rate        The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of webrtc
+ */
+webrtc_handle_t *webrtc_create(
+    int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
+ *
+ * @param handle        The instance of webrtc.
+ * @param indata        An array of 16-bit signed audio samples.
+ * @param size          The sample size of output data (presumably written through this pointer -- TODO confirm)
+ * @param enable_ns     Enable noise suppression
+ * @param enable_agc    Enable automatic gain control
+ *
+ * @return data after noise suppression
+ */
+int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
+
+/**
+ * @brief Free the webrtc instance
+ *
+ * @param handle The instance of webrtc.
+ *
+ * @return None
+ *
+ */
+void webrtc_destroy(webrtc_handle_t *handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_WEBRTC_H_
--- a/include/esp32c6/esp_vad.h
+++ b/include/esp32c6/esp_vad.h
@ -0,0 +1,178 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_VAD_H_
+#define _ESP_VAD_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
+
+/**
+ * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+ * restrictive in reporting speech. So if you want to trigger on more speech, please select a lower mode.
+ */
+typedef enum {
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
+} vad_mode_t;
+
+typedef enum {
+    VAD_SILENCE = 0,
+    VAD_SPEECH = 1,
+} vad_state_t;
+
+typedef struct vad_trigger_tag {
+    vad_state_t state;
+    unsigned int min_speech_len;
+    unsigned int noise_len;
+    unsigned int min_noise_len;
+    unsigned int speech_len;
+} vad_trigger_t;
+
+#define vad_MAX_LEN (INT32_MAX - 1)  // parenthesized so the macro expands as one value inside larger expressions
+/**
+ * @brief Allocate a VAD trigger
+ *
+ * @param min_speech_len  Minimum frame number of speech duration
+ * @param min_noise_len   Minimum frame number of noise duration
+ *
+ * @return Trigger pointer
+ **/
+vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
+
+/**
+ * @brief Free a VAD trigger
+ **/
+void vad_trigger_free(vad_trigger_t *trigger);
+
+/**
+ * @brief Reset a VAD trigger
+ **/
+void vad_trigger_reset(vad_trigger_t *trigger);
+
+/**
+ * @brief Detect voice activity by trigger
+ **/
+vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
+
+typedef struct {
+    vad_trigger_t *trigger;
+    void *vad_inst;
+    int sample_rate;
+    int frame_size;
+} vad_handle_with_trigger_t;
+
+typedef vad_handle_with_trigger_t *vad_handle_t;
+
+// typedef vad_handle_tag * vad_handle_t;
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create(vad_mode_t vad_mode);
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ * @param sample_rate       Sample rate in Hz
+ * @param one_frame_ms      Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
+ * @param min_speech_ms     Minimum speech duration, unit is ms
+ * @param min_noise_ms      Minimum noise duration, unit is ms
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @param sample_rate_hz    The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
+ * @param one_frame_ms      The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
+
+/**
+ * @brief Reset trigger state as Silence
+ *
+ * @param handle            The instance of VAD.
+ */
+void vad_reset_trigger(vad_handle_t handle);
+
+/**
+ * @brief Free the VAD instance
+ *
+ * @param inst The instance of VAD.
+ *
+ * @return None
+ *
+ */
+void vad_destroy(vad_handle_t inst);
+
+/*
+ * Programming Guide:
+ *
+ * @code{c}
+ * // Creates an instance to the VAD structure.
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_VAD_H_
--- a/include/esp32c6/esp_vadn_iface.h
+++ b/include/esp32c6/esp_vadn_iface.h
@ -0,0 +1,164 @@
+#pragma once
+#include "esp_vad.h"
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+// /**
+//  * @brief The state of vad
+//  */
+// typedef enum {
+//     VAD_NOISE = -1,  // Noise
+//     VADNET_STATE_SILENCE = 0, // Silence
+//     VAD_SPEECH = 1   // Speech
+// } vad_state_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode
+ * and specified model name
+ *
+ * @param model_name  The specified model name
+ * @param mode        The voice activity detection mode
+ * @param channel_num The number of input audio channels
+ * @param min_speech_ms  The minimum duration of speech in ms to trigger vad
+ * speech
+ * @param min_noise_ms   The minimum duration of noise in ms to trigger vad
+ * noise
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
+    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect
+ * function
+ *
+ * Every speech recognition model processes a certain number of samples at the
+ * same time. This function can be used to query that amount. Note that the
+ * returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of audio channels that need to be passed to the detect
+ * function
+ *
+ * The returned value is a channel count, not a sample count or a byte count.
+ * Presumably it equals the channel_num passed to the create function
+ * (TODO confirm).
+ *
+ * @param model The model object to query
+ * @return The number of channels
+ */
+typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The detection threshold, the range of
+ * det_threshold is 0.5~0.9999
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the voice activity detection threshold
+ *
+ * @param model The model object to query
+ * @returns the detection threshold
+ */
+typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the vad model and detect whether it is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used
+ * can be queried by the get_samp_chunksize function.
+ * @return The detected state as a vad_state_t (VAD_SILENCE or VAD_SPEECH,
+ * see esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether it is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param cq An array of 16-bit MFCC.
+ * @return The detected state as a vad_state_t (VAD_SILENCE or VAD_SPEECH,
+ * see esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a model object
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * This structure contains the functions used to do operations on a voice
+ * activity detection model.
+ */
+typedef struct {
+    esp_vadn_iface_op_create_t create;
+    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_vadn_iface_op_get_channel_num_t get_channel_num;
+    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
+    esp_vadn_iface_op_detect_t detect;
+    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_vadn_iface_op_clean_t clean;
+    esp_vadn_iface_op_destroy_t destroy;
+} esp_vadn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_vadn_models.h
+++ b/include/esp32c6/esp_vadn_models.h
@ -0,0 +1,22 @@
+#pragma once
+#include "esp_vadn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of vadnet model names, used to filter all vadnet models from the available models.
+#define ESP_VADN_PREFIX "vadnet"
+
+/**
+ * @brief Get the vadnet handle from model name
+ *
+ * @param model_name   The name of model
+ * @returns The handle of vadnet
+ */
+const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32c6/esp_wn_iface.h
+++ b/include/esp32c6/esp_wn_iface.h
@ -29,6 +29,7 @@ typedef enum {
    DET_MODE_2CH_95 = 3,
    DET_MODE_3CH_90 = 4,
    DET_MODE_3CH_95 = 5,
+	DET_MODE_90_COPY_PARAMS = 6,       // Aggressive
 } det_mode_t;

 typedef struct {
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
 * @brief Set the detection threshold to manually abjust the probability 
 *
 * @param model The model object to query
- * @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
 * @param word_index The index of wake word
 * @return 0: setting failed, 1: setting success
 */
 typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);

+/**
+ * @brief Reset the threshold to its initial state  
+ *
+ * @param model The model object to query
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
+
+
 /**
 * @brief Get the wake word detection threshold of different modes
 *
@ -200,6 +210,7 @@ typedef struct {
    esp_wn_iface_op_get_word_num_t get_word_num;
    esp_wn_iface_op_get_word_name_t get_word_name;
    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
--- a/include/esp32c6/esp_wn_models.h
+++ b/include/esp32c6/esp_wn_models.h
@ -11,7 +11,7 @@ extern "C" {
 /**
 * @brief Get the wakenet handle from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The handle of wakenet
 */
 const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
 /**
 * @brief Get the wake word name from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
 */
-char* esp_wn_wakeword_from_name(const char *model_name);
+char *esp_wn_wakeword_from_name(const char *model_name);

 #ifdef __cplusplus
 }
--- a/include/esp32c6/flite_g2p.h
+++ b/include/esp32c6/flite_g2p.h
@ -0,0 +1,20 @@
+#ifndef __FLITE_G2P_H__
+#define __FLITE_G2P_H__
+
+typedef struct {
+    int num_phonemes;
+    int phoneme_size;
+    char **phonemes;
+} flite_g2p_result;
+
+void flite_g2p_result_free(flite_g2p_result *result);
+
+flite_g2p_result *flite_g2p_get_result(const char *grapheme);
+
+void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p(const char *graphemes, int map_phonemes);
+
+#endif
--- a/include/esp32p4/esp_mfcc_models.h
+++ b/include/esp32p4/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32s2/esp_afe_aec.h
+++ b/include/esp32s2/esp_afe_aec.h
@ -2,9 +2,8 @@
 #ifndef _ESP_AFE_AEC_H_
 #define _ESP_AFE_AEC_H_

-
-#include "esp_afe_config.h"
 #include "esp_aec.h"
+#include "esp_afe_config.h"

 #include <stdint.h>

@ -13,19 +12,19 @@ extern "C" {
 #endif

 typedef struct {
-    aec_handle_t* handle;
+    aec_handle_t *handle;
    aec_mode_t mode;
    afe_pcm_config_t pcm_config;
    int frame_size;
-    int16_t  *data;
-}afe_aec_handle_t;
-
+    int16_t *data;
+} afe_aec_handle_t;

 /**
- * @brief Creates an instance to the AEC structure. 
- * 
- * @warning Currently only support 1 microphone channel and 1 playback channe. 
- * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ * @brief Creates an instance to the AEC structure.
+ *
+ * @warning Currently only supports 1 microphone channel and 1 playback channel.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback
+ * channel will be selected.
 *
 * The input format, same as afe config:
 * M to represent the microphone channel
@ -37,7 +36,8 @@ typedef struct {
 *
 * @param input_format     The input format
 * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
- *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
+ * esp32c5.
 * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
 * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
@ -45,17 +45,17 @@ typedef struct {
 */
 afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);

-
 /**
 * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
- * 
+ *
 * @param inst        The instance of AEC.
- * @param indata      Input audio data, format is define by input_format. Note indata will be modified in function call.
- * @param outdata     Returns near-end signal with echo removed. 
+ * @param indata      Input audio data, format is defined by input_format.
+ * @param outdata     Near-end signal with echo removed. outdata must be 16-byte aligned;
+ *                    please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory

 * @return The bytes of outdata.
 */
-size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);

 /**
 * @brief Get frame size of AEC (the samples of one frame)
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
 */
 int afe_aec_get_chunksize(afe_aec_handle_t *handle);

-
 /**
 * @brief Free the AEC instance
 *
--- a/include/esp32s2/esp_afe_config.h
+++ b/include/esp32s2/esp_afe_config.h
@ -1,9 +1,15 @@
 #pragma once
 #include "esp_aec.h"
+#include "esp_agc.h"
+#include "esp_nsn_models.h"
+#include "esp_vad.h"
+#include "esp_vadn_models.h"
+#include "esp_wn_iface.h"
+#include "esp_wn_models.h"
+#include "model_path.h"
 #include "stdbool.h"
 #include "stdint.h"
 #include "stdlib.h"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@ -27,7 +33,8 @@ typedef enum {
 // Set AFE type
 typedef enum {
    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
-    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
+    AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
 } afe_type_t;

 typedef enum {
@ -62,8 +69,220 @@ typedef enum {
    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
 } afe_agc_mode_t;

+/**
+ * @brief Function to get the debug audio data
+ *
+ * @param data        The debug audio data, which must not be modified. It should be copied away as soon as possible
+ * to avoid blocking for too long.
+ * @param data_size   The number of bytes of data.
+ * @returns
+ */
+typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
+
+typedef enum {
+    AFE_DEBUG_HOOK_MASE_TASK_IN = 0,  // To get the input data of mase task
+    AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
+    AFE_DEBUG_HOOK_MAX = 2
+} afe_debug_hook_type_t;
+
+typedef struct {
+    afe_debug_hook_type_t hook_type;         // debug type of hook
+    afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
+} afe_debug_hook_t;
+
+typedef struct {
+    /********** AEC(Acoustic Echo Cancellation) **********/
+    bool aec_init;         // Whether to init aec
+    aec_mode_t aec_mode;   // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
+    int aec_filter_length; // The filter length of aec
+
+    /********** SE(Speech Enhancement, microphone array processing) **********/
+    bool se_init; // Whether to init se
+
+    /********** NS(Noise Suppression) **********/
+    bool ns_init;              // Whether to init ns
+    char *ns_model_name;       // Model name of ns
+    afe_ns_mode_t afe_ns_mode; // Model mode of ns
+
+    /********** VAD(Voice Activity Detection) **********/
+    bool vad_init;          // Whether to init vad
+    vad_mode_t vad_mode;    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
+    char *vad_model_name;   // The model name of vad. If it is null, WebRTC VAD will be used.
+    int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
+    int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
+                            // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
+    bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
+    bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
+
+    /********** WakeNet(Wake Word Engine) **********/
+    bool wakenet_init;          // Whether to init wakenet
+    char *wakenet_model_name;   // The model name of wakenet 1
+    char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
+    det_mode_t wakenet_mode;    // The mode of wakenet
+
+    /********** AGC(Automatic Gain Control) **********/
+    bool agc_init; // Whether to init agc
+    afe_agc_mode_t
+        agc_mode; // The AGC mode for ASR; the gain generated by AGC acts on the audio after far linear gain.
+    int agc_compression_gain_db; // Compression gain in dB (default 9)
+    int agc_target_level_dbfs;   // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
+
+    /********** General AFE(Audio Front End) parameter **********/
+    afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
+    afe_mode_t afe_mode;         // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+    afe_type_t afe_type;         // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
+    int afe_perferred_core;      // The preferred core of afe se task, which is created in afe_create function.
+    int afe_perferred_priority;  // The preferred priority of afe se task, which is created in afe_create function.
+    int afe_ringbuf_size;        // The ring buffer size: the number of frame data in ring buffer.
+    afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
+    float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
+                           // directly on the output amplitude: afe_linear_gain * amplitude.
+    bool debug_init;
+    bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
+                              // otherwise, select channel number by wakenet
+} afe_config_t;
+
+/**
+ * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
+ * on the chip target and input format. You can manually fine-tune it after creating the configuration
+ *
+ * The input format:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param models           Models from partition, which is configured by Kconfig
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_config_t*  The default config of afe
+ */
+afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
+
+/**
+ * @brief Check AFE configuration and make sure it is correct.
+ *
+ * @warning If there is a configuration conflict, this function will modify some parameters.
+ * The guiding principle behind these modifications is to maintain the highest performance of the output audio and
+ * results, and to remove conflicts between different algorithms.
+ *
+ * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
+ * If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
+ *
+ * @param afe_config       Input AFE config
+ *
+ * @return afe_config_t*  The modified AFE config
+ */
+afe_config_t *afe_config_check(afe_config_t *afe_config);
+
+/**
+ * @brief Parse input format
+ *
+ * @param input_format The input format, same with afe_config_init() function
+ * @param pcm_config   The pcm config
+ *
+ * @return true if the input format is parsed successfully, otherwise false
+ */
+bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse I2S input data
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param mic_data     The output microphone data
+ * @param ref_data     The output playback reference data
+ * @param pcm_config   The pcm config
+ *
+ */
+void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
+
+/**
+ * @brief Parse input data, from interleaved arrangement to contiguous arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Format input data, from contiguous arrangement to interleaved arrangement
+ *
+ * @param data         The input multi channel data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param channel_num  The channel number of data
+ * @param out_data     The output data
+ *
+ */
+void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
+
+/**
+ * @brief Adjust the gain of input data
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param data         The input audio data
+ * @param frame_size   The frame size of input, it is also the size of single channel data
+ * @param factor       The gain factor
+ *
+ * @return int16_t*    The output audio data
+ */
+int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
+
+/**
+ * @brief Concatenate input data into the output data (TODO confirm: inferred from the function name)
+ *
+ * @warning the input data will be modified inplace.
+ *
+ * @param in_data         The input audio data
+ * @param in_frame_size   The frame size of the input data
+ * @param channel_num     The channel number of input data, which is same as output data
+ * @param out_data        The output audio data
+ * @param out_frame_size  The frame size of the output data
+ *
+ */
+void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
+
+/**
+ * @brief Copy the afe config
+ *
+ * @param dst_config    The destination afe config
+ * @param src_config    The source afe config
+ *
+ * @return   The destination afe config
+ */
+afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
+
+/**
+ * @brief Print the afe config
+ *
+ * @param afe_config    The afe config
+ */
+void afe_config_print(const afe_config_t *afe_config);
+
+/**
+ * @brief Allocate afe config
+ *
+ * @return The afe config pointer
+ */
+afe_config_t *afe_config_alloc();
+
+/**
+ * @brief Free afe config
+ *
+ * @param afe_config  The afe config pointer
+ */
+void afe_config_free(afe_config_t *afe_config);

 #ifdef __cplusplus
 }
 #endif
-
--- a/include/esp32s2/esp_afe_doa.h
+++ b/include/esp32s2/esp_afe_doa.h
@ -0,0 +1,48 @@
+#ifndef _ESP_AFE_DOA_H_
+#define _ESP_AFE_DOA_H_
+
+#include "esp_doa.h"
+#include "esp_afe_config.h"
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    doa_handle_t *doa_handle;
+    afe_pcm_config_t pcm_config;
+    int16_t *leftdata;
+    int16_t *rightdata;
+    int frame_size;
+} afe_doa_handle_t;
+
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param input_format     The input format
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized afe_doa_handle_t object pointer. The configuration above is recommended for better performance
+ */
+afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
+/**
+ * @brief Process audio frame for direction estimation
+ * @param handle afe_doa_handle_t instance pointer
+ * @param indata Input audio data, format is defined by input_format.
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
+/**
+ * @brief Release all allocated resources
+ * @param handle afe_doa_handle_t instance pointer to be freed
+ */
+void afe_doa_destroy(afe_doa_handle_t *handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_AFE_DOA_H_ */
--- a/include/esp32s2/esp_afe_sr_iface.h
+++ b/include/esp32s2/esp_afe_sr_iface.h
@ -0,0 +1,237 @@
+#pragma once
+#include "esp_afe_config.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// afe_sr/AFE_SR: the audio front-end for speech recognition
+
+// Opaque AFE_SR data container
+typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
+
+/**
+ * @brief The state of vad
+ */
+typedef enum {
+    AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
+    AFE_VAD_SPEECH = 1   // Deprecated, please use vad_state_t, speech
+} afe_vad_state_t;
+
+/**
+ * @brief The result of fetch function
+ */
+typedef struct afe_fetch_result_t {
+    int16_t *data;      // the target channel data of audio.
+    int data_size;      // the size of data. The unit is byte.
+    int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
+                        // audio that was truncated.
+    int vad_cache_size; // the size of vad_cache. The unit is byte.
+    float data_volume;  // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
+                        // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
+                        // wakenet(about 1.5s), otherwise is the frame length.
+    wakenet_state_t wakeup_state; // the value is wakenet_state_t
+    int wake_word_index;          // if the wake word is detected, it stores the wake word index, which starts from 1.
+    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
+                             // starts from 1.
+    vad_state_t vad_state;   // the value is vad_state_t
+    int trigger_channel_id;  // the channel index of output
+    int wake_word_length;    // the length of wake word. The unit is the number of samples.
+    int ret_value;           // the return state of fetch function
+    int16_t *raw_data;       // the multi-channel output data of audio.
+    int raw_data_channels;   // the channel number of raw data
+    float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
+    void *reserved;          // reserved for future use
+} afe_fetch_result_t;
+
+/**
+ * @brief Function to initialize an AFE_SR instance
+ *
+ * @param afe_config        The config of AFE_SR
+ * @returns Handle to the AFE_SR data
+ */
+typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
+
+/**
+ * @brief Get the amount of each channel samples per frame that need to be passed to the function
+ *
+ * Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param afe The AFE_SR object to query
+ * @return The amount of samples to feed the fetch function
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the channel number
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The amount of total channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the function
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The sample rate, in hz
+ */
+typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Feed samples of an audio stream to the AFE_SR
+ *
+ * @warning  The input data should be arranged in the format of channel interleaving.
+ *           The last channel is reference signal if it has reference data.
+ *
+ * @param afe   The AFE_SR object to query
+ *
+ * @param in    The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
+ *              `get_feed_chunksize`.
+ * @return      The size of input
+ */
+typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *           Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
+ *
+ * @param afe   The AFE_SR object to query
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
+ *
+ * @warning  The output is single channel data, no matter how many channels the input is.
+ *
+ * @param afe            The AFE_SR object to query
+ * @param ticks_to_wait  The timeout value, in ticks, to wait for the fetch result.
+ * @return      The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
+ * audio can be queried by the `get_fetch_chunksize`.)
+ */
+typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
+
+/**
+ * @brief reset ringbuf of AFE.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Set wakenet detection threshold 
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @param threshold     The wakenet detection threshold, the value is between 0.4 and 0.9999.
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
+
+/**
+ * @brief Reset wakenet detection threshold to initial state
+ * 
+ * @param afe           The AFE_SR object to query
+ * @param index         The wakenet index, just support 1: wakenet1 or  2: wakenet2
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
+
+/**
+ * @brief Reset one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 1: success
+ */
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Disable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Enable one function/module/algorithm.
+ *
+ * @param afe          The AFE_SR object to query
+ * @return             -1: fail, 0: disabled, 1: enabled
+ */
+typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Print all functions/modules/algorithms pipeline.
+ *       The pipeline is the order of the functions/modules/algorithms.
+ *       The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
+ *
+ * @param afe          The AFE_SR object to query
+ */
+typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Destroy a AFE_SR instance
+ *
+ * @param afe         AFE_SR object to destroy
+ */
+typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * This structure contains the functions used to do operations on a AFE_SR.
+ */
+typedef struct {
+    esp_afe_sr_iface_op_create_from_config_t create_from_config;
+    esp_afe_sr_iface_op_feed_t feed;
+    esp_afe_sr_iface_op_fetch_t fetch;
+    esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
+    esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
+    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
+    esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
+    esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
+    esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
+    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
+    esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
+    esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
+    esp_afe_sr_iface_op_disable_func_t disable_wakenet;
+    esp_afe_sr_iface_op_enable_func_t enable_wakenet;
+    esp_afe_sr_iface_op_disable_func_t disable_aec;
+    esp_afe_sr_iface_op_enable_func_t enable_aec;
+    esp_afe_sr_iface_op_disable_func_t disable_se;
+    esp_afe_sr_iface_op_enable_func_t enable_se;
+    esp_afe_sr_iface_op_disable_func_t disable_vad;
+    esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
+    esp_afe_sr_iface_op_disable_func_t disable_ns;
+    esp_afe_sr_iface_op_enable_func_t enable_ns;
+    esp_afe_sr_iface_op_disable_func_t disable_agc;
+    esp_afe_sr_iface_op_enable_func_t enable_agc;
+    esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
+    esp_afe_sr_iface_op_destroy_t destroy;
+} esp_afe_sr_iface_t;
+
+// struct is used to store the AFE handle and data for the AFE task
+typedef struct {
+    esp_afe_sr_data_t *afe_data;
+    esp_afe_sr_iface_t *afe_handle;
+    TaskHandle_t feed_task;
+    TaskHandle_t fetch_task;
+} afe_task_into_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_afe_sr_models.h
+++ b/include/esp32s2/esp_afe_sr_models.h
@ -0,0 +1,13 @@
+#pragma once
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "esp_afe_sr_iface.h"
+
+esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_agc.h
+++ b/include/esp32s2/esp_agc.h
@ -0,0 +1,47 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AGC_H_
+#define _ESP_AGC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+//// All non-negative values are valid; negative values are errors
+typedef enum {
+    ESP_AGC_SUCCESS = 0,   ////success
+    ESP_AGC_FAIL = -1, ////agc fail
+    ESP_AGC_SAMPLE_RATE_ERROR = -2,  ///sample rate can be only 8khz, 16khz, 32khz
+    ESP_AGC_FRAME_SIZE_ERROR = -3,   ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
+} ESP_AGE_ERR;
+
+typedef enum {
+    AGC_MODE_SR = -1,      // Bypass WEBRTC AGC
+    AGC_MODE_0 = 0,        // Only saturation protection
+    AGC_MODE_1 = 1,        // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_2 = 2,        // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
+    AGC_MODE_3 = 3,        // Fixed Digital Gain [compressionGaindB (default 8 dB)]
+} agc_mode_t;
+
+void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
+void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
+int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
+void esp_agc_close(void *agc_handle);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // _ESP_AGC_H_
--- a/include/esp32s2/esp_doa.h
+++ b/include/esp32s2/esp_doa.h
@ -0,0 +1,41 @@
+#ifndef _ESP_DOA_H_
+#define _ESP_DOA_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct doa_handle_t doa_handle_t;
+/**
+ * @brief Initialize SRP-PHAT processor
+ * @param fs Sampling rate (Hz), e.g., 16000
+ * @param resolution Angular search resolution (degrees), e.g., 20
+ * @param d_mics Microphone spacing (meters), e.g., 0.06
+ * @param input_timedate_samples input timedate samples, e.g., 1024
+ * @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
+ */
+doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
+
+/**
+ * @brief Release all allocated resources
+ * @param doa doa_handle_t instance pointer to be freed
+ */
+void esp_doa_destroy(doa_handle_t *doa);
+
+/**
+ * @brief Process audio frame for direction estimation
+ * @param doa doa_handle_t instance pointer
+ * @param left Left channel 16-bit PCM data
+ * @param right Right channel 16-bit PCM data
+ * @return Estimated sound direction in degrees, e.g., 0-180
+ */
+float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ESP_DOA_H_ */
--- a/include/esp32s2/esp_mase.h
+++ b/include/esp32s2/esp_mase.h
@ -0,0 +1,93 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_MASE_H_
+#define _ESP_MASE_H_
+
+#include <stdint.h> // int16_t is used by mase_process(); keeps this header self-contained
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define MASE_SAMPLE_RATE 16000        // Supports 16kHz only
+#define MASE_FRAME_SIZE 16            // Supports 16ms only
+#define MASE_MIC_DISTANCE 65          // According to physical design of mic-array
+
+/**
+ * @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array 
+ * are supported.
+ */
+typedef enum {
+    TWO_MIC_LINE = 0,
+    THREE_MIC_CIRCLE = 1
+} mase_mic_array_type_t;
+
+/**
+ * @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
+ */
+typedef enum {
+    NORMAL_ENHANCEMENT_MODE = 0,
+    WAKE_UP_ENHANCEMENT_MODE = 1
+} mase_op_mode_t;
+
+typedef void* mase_handle_t;
+
+/**
+ * @brief Creates an instance of the MASE structure.
+ *
+ * @param fs                The sampling frequency (Hz), must be 16000 (see MASE_SAMPLE_RATE).
+ *
+ * @param frame_size        The length of the audio processing, must be 16ms (see MASE_FRAME_SIZE).
+ *
+ * @param array_type        '0' for 2-mic line array (TWO_MIC_LINE) and '1' for 3-mic circular array (THREE_MIC_CIRCLE).
+ *
+ * @param mic_distance      The distance between neighboring microphones in mm.
+ *
+ * @param operating_mode    '0' for normal mode (NORMAL_ENHANCEMENT_MODE) and '1' for wake-up enhanced mode (WAKE_UP_ENHANCEMENT_MODE).
+ *
+ * @param filter_strength   Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: An instance of MASE
+ */
+mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
+
+/**
+ * @brief Performs mic array processing for one frame.
+ *
+ * @param st          The instance of MASE.
+ *
+ * @param in          An array of 16-bit signed audio samples from mic.
+ *
+ * @param dsp_out     Returns enhanced signal.
+ *
+ * @return None
+ *
+ */
+void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
+
+/**
+ * @brief Free the MASE instance
+ *
+ * @note The function name is spelled "destory" (sic); that is the declared symbol name.
+ *
+ * @param st The instance of MASE.
+ *
+ * @return None
+ *
+ */
+void mase_destory(mase_handle_t st);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
--- a/include/esp32s2/esp_mfcc_models.h
+++ b/include/esp32s2/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/include/esp32s2/esp_mn_iface.h
+++ b/include/esp32s2/esp_mn_iface.h
@ -0,0 +1,223 @@
+#pragma once
+#include "stdint.h"
+#include "esp_wn_iface.h"
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define ESP_MN_RESULT_MAX_NUM 5
+#define ESP_MN_MAX_PHRASE_NUM 400
+#define ESP_MN_MAX_PHRASE_LEN 63
+#define ESP_MN_MIN_PHRASE_LEN 2
+
+#define ESP_MN_PREFIX "mn"
+#define ESP_MN_ENGLISH "en"
+#define ESP_MN_CHINESE "cn"
+
+typedef enum {
+    ESP_MN_STATE_DETECTING = 0,     // detecting
+    ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_TIMEOUT = 2,       // time out
+} esp_mn_state_t;
+
+// MultiNet loading mode.
+// Memory consumption decreases as the mode value increases;
+// as a consequence, the CPU load goes up.
+typedef enum {
+    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with maximum memory consumption
+    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and load the rest from FLASH (default)
+    ESP_MN_LOAD_FROM_FLASH = 2,          // Load more weights from FLASH. Minimum memory consumption with slowest computation
+} esp_mn_loader_mode_t;
+
+typedef enum {
+    ESP_MN_GREEDY_SEARCH = 0,          // greedy search
+    ESP_MN_BEAM_SEARCH = 1,            // beam search
+    ESP_MN_BEAM_SEARCH_WITH_FST = 2,  // beam search with trie language model
+} esp_mn_search_method_t;
+
+typedef enum {
+    CHINESE_ID = 1,       // Chinese language
+    ENGLISH_ID = 2,       // English language
+} language_id_t;
+
+// Return all possible recognition results
+typedef struct{
+    esp_mn_state_t state;
+    int num;                                   // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
+    int command_id[ESP_MN_RESULT_MAX_NUM];     // The list of command id.
+    int phrase_id[ESP_MN_RESULT_MAX_NUM];      // The list of phrase id.
+    float prob[ESP_MN_RESULT_MAX_NUM];         // The list of probability.
+    char string[256];
+} esp_mn_results_t;
+
+typedef struct {
+    char *string;                               // command string
+    char *phonemes;                             // command phonemes, if applicable
+    int16_t command_id;                         // the command id
+    float threshold;                            // trigger threshold, default: 0
+    int16_t *wave;                              // prompt wave data of the phrase
+} esp_mn_phrase_t;
+
+typedef struct _mn_node_ {
+    esp_mn_phrase_t *phrase;
+    struct _mn_node_ *next;
+} esp_mn_node_t;
+
+typedef struct{
+    int16_t num;                                // The number of error phrases, which could not be added into the model
+    esp_mn_phrase_t **phrases;                  // The array of error phrase pointers
+} esp_mn_error_t;
+
+/**
+ * @brief Initialize a model instance with the specified model name.
+ *
+ * @param model_name  The multinet model name.
+ * @param duration    The duration (ms) to trigger the timeout
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
+
+/**
+ * @brief Switch multinet mode to change memory consumption and CPU loading
+ *
+ * @warning Only supported by multinet6 or later versions
+ *
+ * @param model The model object to query
+ * @param mode  The multinet loader mode
+ *
+ * @returns Handle to the model data.
+ */
+typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
+
+/**
+ * @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
+ *
+ * Every speech recognition model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model       The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Callback function type to fetch the number of frames recognized by the command word
+ *
+ * @param model       The model object to query
+ * @return The number of the frames recognized by the command word
+ */
+typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
+ */
+typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model       The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the language of model
+ *
+ * @param model       The model object to query
+ * @return Language name string defined in esp_mn_iface.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
+ */
+typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
+ *
+ * @param model       The model object to query.
+ * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the
+ *                    get_samp_chunksize function.
+ * @return The state of multinet
+ */
+typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Destroy a speech commands recognition model
+ *
+ * @param model       The Model object to destroy
+ */
+typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get recognition results
+ *
+ * @param model       The Model object to query
+ *
+ * @return The current results.
+ */
+typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
+
+/**
+ * @brief Open the log print
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Clean all status of model
+ *
+ * @param model_data       The model object to query.
+ *
+ */
+typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
+
+/**
+ * @brief Set the speech commands by mn_command_root
+ *
+ * @param model_data       The model object to query.
+ * @param mn_command_root  The speech commands link.
+ * @return The error phrase id info.
+ */
+typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
+
+
+/**
+ * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
+ *
+ * @param model_data     The model object to query
+*/
+typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
+
+/**
+ * @brief Check if input string can be tokenized
+ *
+ * @param model_data     The model object to query
+ * @param str            The input string
+*/
+typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
+
+typedef struct {
+    esp_mn_iface_op_create_t create;
+    esp_mn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
+    esp_mn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_mn_iface_op_get_language_t get_language;
+    esp_mn_iface_op_detect_t detect;
+    esp_mn_iface_op_destroy_t destroy;
+    esp_mn_iface_op_get_results_t get_results;
+    esp_mn_iface_op_open_log_t open_log;
+    esp_mn_iface_op_clean_t clean;
+    esp_wn_iface_op_set_speech_commands set_speech_commands;
+    esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
+    esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
+    esp_mn_iface_op_check_speech_command check_speech_command;
+} esp_mn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_mn_models.h
+++ b/include/esp32s2/esp_mn_models.h
@ -0,0 +1,66 @@
+#pragma once
+#include "esp_mn_iface.h"
+
+//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
+//a specific phrase or word.
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/**
+ * @brief Get the multinet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of multinet
+ */
+esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
+
+/**
+ * @brief Get the multinet language from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The language of multinet
+ */
+char *esp_mn_language_from_name(char *model_name);
+
+/*
+ Configure the MultiNet model to use based on what's selected in menuconfig.
+*/
+
+#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
+#include "multinet2_ch.h"
+#define MULTINET_COEFF get_coeff_multinet2_ch
+#define MULTINET_MODEL_NAME "mn2_cn"
+
+#else
+#define MULTINET_COEFF      "COEFF_NULL"
+#define MULTINET_MODEL_NAME "NULL"
+#endif
+
+
+/* example
+
+//Look up the MultiNet interface by model name
+esp_mn_iface_t *multinet = esp_mn_handle_from_name(MULTINET_MODEL_NAME);
+
+//Initialize MultiNet model data (second argument: timeout duration in ms, example value)
+model_iface_data_t *model_data = multinet->create(MULTINET_MODEL_NAME, 6000);
+add_speech_commands(multinet, model_data);
+
+//Set parameters of buffer
+int audio_chunksize = multinet->get_samp_chunksize(model_data);
+int frequency = multinet->get_samp_rate(model_data);
+int16_t *buffer = malloc(audio_chunksize * sizeof(int16_t));
+
+//Detect
+esp_mn_state_t state = multinet->detect(model_data, buffer);
+if (state == ESP_MN_STATE_DETECTED) {
+    printf("Detection triggered output %d.\n", state);
+}
+
+//Destroy model
+multinet->destroy(model_data);
+
+*/
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_ns.h
+++ b/include/esp32s2/esp_ns.h
@ -0,0 +1,86 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_NS_H_
+#define _ESP_NS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NS_USE_SPIARM       0
+#define NS_FRAME_LENGTH_MS     10          //Supports 10ms, 20ms, 30ms
+
+/**
+* The Sampling frequency (Hz) must be 16000Hz
+*/
+
+typedef void* ns_handle_t;
+
+/**
+ * @brief Creates an instance to the NS structure.
+ *
+ * @param frame_length   The length of the audio processing can be 10ms, 20ms, 30ms.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_create(int frame_length);
+
+/**
+ * @brief Creates an instance of the more powerful noise suppression algorithm.
+ *
+ * @warning frame_length only supports 10 ms.
+ *
+ * @param frame_length    The length of the audio processing can only be 10ms.
+ * @param mode            0: Mild, 1: Medium, 2: Aggressive
+ * @param sample_rate     The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
+ *
+ * @param inst        The instance of NS.
+ *
+ * @param indata      An array of 16-bit signed audio samples.
+ *
+ * @param outdata     An array of 16-bit signed audio samples after noise suppression.
+ *
+ * @return None
+ *
+ */
+void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Free the NS instance
+ *
+ * @param inst The instance of NS.
+ *
+ * @return None
+ *
+ */
+void ns_destroy(ns_handle_t inst);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_NS_H_
--- a/include/esp32s2/esp_nsn_iface.h
+++ b/include/esp32s2/esp_nsn_iface.h
@ -0,0 +1,64 @@
+#pragma once
+#include "stdint.h"
+
+//Opaque model data container
+typedef struct esp_nsn_data_t esp_nsn_data_t;
+
+
+/**
+ * @brief Easy function type to initialize a model instance
+ *
+ * @param model_name The name of the model instance
+ * @returns Handle to the model data
+ */
+typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the process function
+ *
+ * Every noise suppression model processes a certain number of samples at the same time. This function
+ * can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the process function
+ */
+typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the noise suppression model and get data after process.
+ *
+ *
+ * @param model The model object to query
+ * @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the 
+ *        get_samp_chunksize function.
+ * @param out_data An array of 16-bit signed audio samples after process.
+ * @return The state of return.
+ */
+typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the process function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
+
+/**
+ * @brief Destroy a noise suppression model
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
+
+
+/**
+ * This structure contains the functions used to do operations on a noise suppression model.
+ */
+typedef struct {
+    esp_nsn_iface_op_create_t create;
+    esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_nsn_iface_op_process_t process;
+    esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_nsn_iface_op_destroy_t destroy;
+} esp_nsn_iface_t;
--- a/include/esp32s2/esp_nsn_models.h
+++ b/include/esp32s2/esp_nsn_models.h
@ -0,0 +1,17 @@
+#pragma once
+
+#include "esp_nsn_iface.h"
+
+// Give the declarations below C linkage so C++ callers link against the unmangled symbol.
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+The prefix of nsnet model names
+Now there are nsnet1 and nsnet2
+*/
+#define ESP_NSNET_PREFIX "nsnet"
+
+/**
+ * @brief Get the nsnet handle from model name
+ *
+ * @param model_name   The name of model
+ * @returns The handle of nsnet
+ */
+esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_speech_features.h
+++ b/include/esp32s2/esp_speech_features.h
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

 float *esp_fftr(float *x, int nfft, void *fft_table);

-float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
+float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);

 void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

--- a/include/esp32s2/esp_sr_webrtc.h
+++ b/include/esp32s2/esp_sr_webrtc.h
@ -0,0 +1,84 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_WEBRTC_H_
+#define _ESP_WEBRTC_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+#include "esp_agc.h"
+#include "esp_log.h"
+#include "esp_ns.h"
+#include "sr_ringbuf.h"
+#include <stdbool.h> // bool is used by webrtc_process(); do not rely on transitive includes
+#include <stdint.h>
+
+#include "esp_heap_caps.h"
+
+typedef struct {
+    void *ns_handle;
+    void *agc_handle;
+    int frame_size;
+    int sample_rate;
+    int16_t *buff;
+    int16_t *out_data;
+    sr_ringbuf_handle_t rb;
+} webrtc_handle_t;
+
+/**
+ * @brief Creates an instance of webrtc.
+ *
+ * @warning frame_length_ms can be 10 ms, 20 ms, 30 ms or 32 ms.
+ *
+ * @param frame_length_ms    The length of the audio processing
+ * @param ns_mode            The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
+ * @param agc_mode           The mode of AGC
+ * @param agc_gain           The gain of AGC. default is 9
+ * @param agc_target_level   The target level of AGC. default is -3 dbfs
+ * @param sample_rate        The sample rate of the audio.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of webrtc
+ */
+webrtc_handle_t *webrtc_create(
+    int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
+
+/**
+ * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
+ *
+ * @param handle        The instance of webrtc.
+ * @param indata        An array of 16-bit signed audio samples.
+ * @param size          The sample size of output data
+ * @param enable_ns     Enable noise suppression
+ * @param enable_agc    Enable automatic gain control
+ *
+ * @return data after noise suppression
+ */
+int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
+
+/**
+ * @brief Free the webrtc instance
+ *
+ * @param handle The instance of webrtc.
+ *
+ * @return None
+ *
+ */
+void webrtc_destroy(webrtc_handle_t *handle);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_WEBRTC_H_
--- a/include/esp32s2/esp_vad.h
+++ b/include/esp32s2/esp_vad.h
@ -0,0 +1,178 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_VAD_H_
+#define _ESP_VAD_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
+
+/**
+ * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
+ * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
+ */
+typedef enum {
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
+} vad_mode_t;
+
+typedef enum {
+    VAD_SILENCE = 0,
+    VAD_SPEECH = 1,
+} vad_state_t;
+
+typedef struct vad_trigger_tag {
+    vad_state_t state;
+    unsigned int min_speech_len;
+    unsigned int noise_len;
+    unsigned int min_noise_len;
+    unsigned int speech_len;
+} vad_trigger_t;
+
+// Parenthesized so the macro expands as a single value inside larger expressions.
+#define vad_MAX_LEN (INT32_MAX - 1)
+/**
+ * @brief Allocate VAD trigger
+ *
+ * @param min_speech_len  Minimum frame number of speech duration
+ * @param min_noise_len   Minimum frame number of noise duration
+ *
+ * @return Trigger pointer
+ **/
+vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
+
+/**
+ * @brief Free VAD trigger
+ **/
+void vad_trigger_free(vad_trigger_t *trigger);
+
+/**
+ * @brief Reset VAD trigger
+ **/
+void vad_trigger_reset(vad_trigger_t *trigger);
+
+/**
+ * @brief Detect voice activity by trigger
+ **/
+vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
+
+typedef struct {
+    vad_trigger_t *trigger;
+    void *vad_inst;
+    int sample_rate;
+    int frame_size;
+} vad_handle_with_trigger_t;
+
+typedef vad_handle_with_trigger_t *vad_handle_t;
+
+// typedef vad_handle_tag * vad_handle_t;
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create(vad_mode_t vad_mode);
+
+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ * @param sample_rate       Sample rate in Hz
+ * @param one_frame_ms      Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
+ * @param min_speech_ms     Minimum speech duration, unit is ms
+ * @param min_noise_ms      Minimum noise duration, unit is ms
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @param sample_rate_hz    The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
+ * @param one_frame_ms      The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param handle            The instance of VAD.
+ * @param data              An array of 16-bit signed audio samples.
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
+
+/**
+ * @brief Reset trigger state as Silence
+ *
+ * @param handle            The instance of VAD.
+ */
+void vad_reset_trigger(vad_handle_t handle);
+
+/**
+ * @brief Free the VAD instance
+ *
+ * @param inst The instance of VAD.
+ *
+ * @return None
+ *
+ */
+void vad_destroy(vad_handle_t inst);
+
+/*
+ * Programming Guide:
+ *
+ * @code{c}
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);     // Creates an instance of the VAD structure.
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_VAD_H_
--- a/include/esp32s2/esp_vadn_iface.h
+++ b/include/esp32s2/esp_vadn_iface.h
@ -0,0 +1,164 @@
+#pragma once
+#include "esp_vad.h"
+#include "stdint.h"
+#include "dl_lib_convq_queue.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+// /**
+//  * @brief The state of vad
+//  */
+// typedef enum {
+//     VAD_NOISE = -1,  // Noise
+//     VADNET_STATE_SILENCE = 0, // Silence
+//     VAD_SPEECH = 1   // Speech
+// } vad_state_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode
+ * and specified model name
+ *
+ * @param model_name  The specified model name
+ * @param mode        The voice activity detection mode
+ * @param channel_num The number of input audio channels
+ * @param min_speech_ms  The minimum duration of speech in ms to trigger vad
+ * speech
+ * @param min_noise_ms   The minimum duration of noise in ms to trigger vad
+ * noise
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
+    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect
+ * function
+ *
+ * Every speech recognition model processes a certain number of samples at the
+ * same time. This function can be used to query that amount. Note that the
+ * returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the channel number of samples that need to be passed to the detect
+ * function
+ *
+ * @param model The model object to query
+ * @return The number of channels
+ */
+typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The detection threshold, the range of
+ * det_threshold is 0.5~0.9999
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the voice activity detection threshold
+ *
+ * @param model The model object to query
+ * @returns the detection threshold
+ */
+typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used
+ * can be queried by the get_samp_chunksize function.
+ * @return The VAD state as a vad_state_t (VAD_SILENCE or VAD_SPEECH, declared
+ * in esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Feed MFCC of an audio stream to the vad model and detect whether is
+ * voice.
+ *
+ * @param model The model object to query
+ * @param cq An array of 16-bit MFCC.
+ * @return The VAD state as a vad_state_t (VAD_SILENCE or VAD_SPEECH, declared
+ * in esp_vad.h).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
+
+/**
+ * @brief Get MFCC of an audio stream
+ *
+ * @param model The model object to query
+ * @return MFCC data
+ */
+typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a model object
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * This structure contains the functions used to do operations on a voice
+ * activity detection model.
+ */
+typedef struct {
+    esp_vadn_iface_op_create_t create;
+    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_vadn_iface_op_get_channel_num_t get_channel_num;
+    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
+    esp_vadn_iface_op_detect_t detect;
+    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
+    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
+    esp_vadn_iface_op_clean_t clean;
+    esp_vadn_iface_op_destroy_t destroy;
+} esp_vadn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_vadn_models.h
+++ b/include/esp32s2/esp_vadn_models.h
@ -0,0 +1,22 @@
+#pragma once
+#include "esp_vadn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of vadnet model names, used to filter all vadnet models from the available models.
+#define ESP_VADN_PREFIX "vadnet"
+
+/**
+ * @brief Get the vadnet handle from model name
+ *
+ * @param model_name   The name of model
+ * @returns The handle of vadnet (esp_vadn_iface_t) for the given model name
+ */
+const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32s2/esp_wn_iface.h
+++ b/include/esp32s2/esp_wn_iface.h
@ -29,6 +29,7 @@ typedef enum {
    DET_MODE_2CH_95 = 3,
    DET_MODE_3CH_90 = 4,
    DET_MODE_3CH_95 = 5,
+	DET_MODE_90_COPY_PARAMS = 6,       // Aggressive
 } det_mode_t;

 typedef struct {
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
 * @brief Set the detection threshold to manually abjust the probability 
 *
 * @param model The model object to query
- * @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
+ * @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
 * @param word_index The index of wake word
 * @return 0: setting failed, 1: setting success
 */
 typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);

+/**
+ * @brief Reset the detection threshold to its initial state
+ *
+ * @param model The model object whose threshold is reset
+ * @return 0: reset failed, 1: reset succeeded
+ */
+typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
+
+
 /**
 * @brief Get the wake word detection threshold of different modes
 *
@ -200,6 +210,7 @@ typedef struct {
    esp_wn_iface_op_get_word_num_t get_word_num;
    esp_wn_iface_op_get_word_name_t get_word_name;
    esp_wn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
    esp_wn_iface_op_get_det_threshold_t get_det_threshold;
    esp_wn_iface_op_get_triggered_channel_t  get_triggered_channel;
    esp_wn_iface_op_get_vol_gain_t get_vol_gain;
--- a/include/esp32s2/esp_wn_models.h
+++ b/include/esp32s2/esp_wn_models.h
@ -11,7 +11,7 @@ extern "C" {
 /**
 * @brief Get the wakenet handle from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The handle of wakenet
 */
 const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
 /**
 * @brief Get the wake word name from model name
 *
- * @param model_name   The name of model 
+ * @param model_name   The name of model
 * @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
 */
-char* esp_wn_wakeword_from_name(const char *model_name);
+char *esp_wn_wakeword_from_name(const char *model_name);

 #ifdef __cplusplus
 }
--- a/include/esp32s2/flite_g2p.h
+++ b/include/esp32s2/flite_g2p.h
@ -0,0 +1,20 @@
+#ifndef __FLITE_G2P_H__
+#define __FLITE_G2P_H__
+
+typedef struct {
+    int num_phonemes;   // NOTE(review): presumably the number of entries in `phonemes` - confirm
+    int phoneme_size;   // NOTE(review): meaning not evident from this header (capacity? entry size?) - confirm
+    char **phonemes;    // NOTE(review): presumably an array of phoneme strings - confirm
+} flite_g2p_result;
+
+void flite_g2p_result_free(flite_g2p_result *result);
+
+flite_g2p_result *flite_g2p_get_result(const char *grapheme);
+
+void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
+
+char *flite_g2p(const char *graphemes, int map_phonemes);
+
+#endif
--- a/include/esp32s3/esp_mfcc_models.h
+++ b/include/esp32s3/esp_mfcc_models.h
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
 /**
 * @brief Return basic opts used in wakenet9s
 **/
-esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
+esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);

 /**
 * @brief Return basic opts for default kaldifeat
--- a/lib/esp32/libc_speech_features.a
+++ b/lib/esp32/libc_speech_features.a
--- a/lib/esp32/libesp_audio_front_end.a
+++ b/lib/esp32/libesp_audio_front_end.a
--- a/lib/esp32/libesp_audio_processor.a
+++ b/lib/esp32/libesp_audio_processor.a
--- a/lib/esp32/libmultinet.a
+++ b/lib/esp32/libmultinet.a
--- a/lib/esp32/libwakenet.a
+++ b/lib/esp32/libwakenet.a
--- a/lib/esp32c3/libc_speech_features.a
+++ b/lib/esp32c3/libc_speech_features.a
--- a/lib/esp32c3/libdl_lib.a
+++ b/lib/esp32c3/libdl_lib.a
--- a/lib/esp32c3/libesp_audio_front_end.a
+++ b/lib/esp32c3/libesp_audio_front_end.a
--- a/Show More
+++ b/Show More