feat: add afe aec interface

2025-09-15 15:28:44 +08:00 · 2025-02-14 17:57:09 +08:00 · 2025-02-14 17:57:09 +08:00 · b485bb4061
commit b485bb4061
parent a8b77d0795
45 changed files with 689 additions and 222 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -111,7 +111,9 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
    
    component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format)
    add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a")
+    add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a")
    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
+    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)

 elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
 #Only support TTS on esp32s2, esp32c3 and esp32c6
--- a/include/esp32/esp_afe_aec.h
+++ b/include/esp32/esp_afe_aec.h
@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance of the AEC structure.
+ *
+ * @warning Currently only 1 microphone channel and 1 playback channel are supported.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
--- a/include/esp32/esp_afe_config.h
+++ b/include/esp32/esp_afe_config.h
@ -110,6 +110,8 @@ typedef struct {
    int vad_min_speech_ms;  // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
    int vad_min_noise_ms;   // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
                            // 1000 ms
+    int vad_delay_ms;       // The delay of the first speech frame in ms, default: 128 ms
+                            // If you find vad cache can not cover all speech, please increase this value.
    bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
    bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false

--- a/include/esp32/esp_afe_sr_iface.h
+++ b/include/esp32/esp_afe_sr_iface.h
@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
 typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);

 /**
- * @brief Enable VAD algorithm.
+ * @brief Reset one function/module/algorithm.
 *
 * @param afe          The AFE_SR object to query
- * @return             -1: fail, 0: disabled, 1: enabled
+ * @return             -1: fail, 1: success
 */
-typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
+typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);

 /**
 * @brief Disable one function/module/algorithm.
@ -204,6 +204,7 @@ typedef struct {
    esp_afe_sr_iface_op_enable_func_t enable_se;
    esp_afe_sr_iface_op_disable_func_t disable_vad;
    esp_afe_sr_iface_op_enable_func_t enable_vad;
+    esp_afe_sr_iface_op_reset_op_t reset_vad;
    esp_afe_sr_iface_op_disable_func_t disable_ns;
    esp_afe_sr_iface_op_enable_func_t enable_ns;
    esp_afe_sr_iface_op_disable_func_t disable_agc;
--- a/include/esp32/esp_mfcc_iface.h
+++ b/include/esp32/esp_mfcc_iface.h
@ -1,6 +1,6 @@
 #pragma once
-#include <stdint.h>
 #include "esp_speech_features.h"
+#include <stdint.h>

 /*
 This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
 multiple implementations can be used.
 */

-
 typedef struct esp_mfcc_data_t esp_mfcc_data_t;

-
-//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
-//refer to its documentation for details.
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
 typedef struct {
-    int winstep_ms;     // The step between successive windows in ms. (10)
-    int winlen_ms;      // The length of the analysis window in ms. (25)
-    int nch;            // The number of input channel
-    int numcep;         // The number of cepstrum to return
-    int nfilter;        // The number of filters in the filterbank
-    int nfft;           // The FFT size
-    int samp_freq;      // The sample-rate of the signal.
-    int low_freq;       // The lowest band edge of mel filters, in hz. (e.g. 0)
-    int high_freq;      // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
-    float preemph;      // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
-    char *win_type;     // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
-    bool append_energy; //　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // 　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
-    float log_epsilon;  // log epsilon. (e.g. 1e-7) 
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
    bool psram_first;   // Alloc memory from PSRAM first
-    bool remove_dc_offset;  // Whether to subtract mean of wave before FFT
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
 } esp_mfcc_opts_t;

-
 /**
 * @brief Un-initialize and free a mfcc runner
 *
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
 * @param opt Options for the mfcc process
 * @return True if success, false on error.
 */
-typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);

 /**
 * @brief Run a mfcc iteration on frame by frame
 *
 * This will take a set of samples and return a ceptrum. Note that this may be pipelined:
- * an initial call to this function may return NULL and subsequent calls may return the 
+ * an initial call to this function may return NULL and subsequent calls may return the
 * cepstrum of previous calls.
 *
 * @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
 *         when done with this buffer. Note that some implementations require the buffer to be freed before another call
 *         to this function is done.
 */
-typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);

 /**
 * @brief Clean all state of mfcc handle
--- a/include/esp32/esp_mfcc_models.h
+++ b/include/esp32/esp_mfcc_models.h
@ -1,18 +1,16 @@
 #pragma once
 #include "esp_mfcc_iface.h"

-
 extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle

-
 /**
 * @brief Return basic opts used in wakenet9 & multinet5
 **/
 esp_mfcc_opts_t *get_mfcc_opts_wn9();

 /**
- * @brief Return basic opts for default kaldifeat 
- * 
+ * @brief Return basic opts for default kaldifeat
+ *
    opts->psram_first = true;
    opts->use_power = true;
    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
 /**
 * @brief Print mfcc opts
 **/
-void print_mfcc_opts(esp_mfcc_opts_t *opts);
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
--- a/include/esp32/esp_speech_features.h
+++ b/include/esp32/esp_speech_features.h
@ -8,46 +8,45 @@
 #define M_2PI 6.283185307179586476925286766559005
 #endif

-typedef struct 
-{
+typedef struct {
    float *coeff;
    int *bank_pos;
    int nfilter;
 } esp_mel_filter_t;

-float* esp_mfcc_malloc(size_t size, bool from_psram);
+float *esp_mfcc_malloc(size_t size, bool from_psram);

 void esp_mfcc_free(void *ptr);

 /**
 * @brief Initialize FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
- * @param nfft  The input samples number 
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
 * @return fft-table
 **/
-void* esp_fft_init(int nfft);
+void *esp_fft_init(int nfft);

 /**
 * @brief Free FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
+ *          For Other platform, use kiss fft
+ *
 * @param fft_table  The fft table initialized by esp_fft_init
- * @param nfft       The input samples number 
+ * @param nfft       The input samples number
 * @return fft-table
 **/
 void esp_fft_deinit(void *fft_table, int nfft);

 /**
 * @brief Initial window function
- *        Currently support hanning, hamming, sine, povey, rectangular, 
+ *        Currently support hanning, hamming, sine, povey, rectangular,
 *        wn9(512-hanning to get wakenet9& multinet5 compatible)
 **/
-float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

-float* esp_fftr(float* x, int nfft, void *fft_table);
+float *esp_fftr(float *x, int nfft, void *fft_table);

 float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);

@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

 float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);

-esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, 
-                                      bool from_psram);
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);

 void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);

-float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, 
-                                float epsilon);
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
--- a/include/esp32/esp_vad.h
+++ b/include/esp32/esp_vad.h
@ -20,19 +20,19 @@
 extern "C" {
 #endif

-#define SAMPLE_RATE_HZ 16000      //Supports 32000, 16000, 8000
-#define VAD_FRAME_LENGTH_MS 30    //Supports 10ms, 20ms, 30ms
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms

 /**
 * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
 * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
 */
 typedef enum {
-    VAD_MODE_0 = 0,  // Normal
-    VAD_MODE_1,      // Aggressive
-    VAD_MODE_2,      // Very Aggressive
-    VAD_MODE_3,      // Very Very Aggressive
-    VAD_MODE_4       // Very Very Very Aggressive
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
 } vad_mode_t;

 typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
 #define vad_MAX_LEN INT32_MAX - 1
 /**
 * @brief Allocate wakenet trigger
- * 
+ *
 * @param min_speech_len  Minimum frame number of speech duration
 * @param min_noise_len   Minimum frame number of noise duration
- * 
+ *
 * @return Trigger pointer
 **/
 vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
 **/
 vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);

-
 typedef struct {
    vad_trigger_t *trigger;
    void *vad_inst;
    int sample_rate;
    int frame_size;
-}vad_handle_with_trigger_t;
+} vad_handle_with_trigger_t;

-typedef vad_handle_with_trigger_t* vad_handle_t;
+typedef vad_handle_with_trigger_t *vad_handle_t;

 // typedef vad_handle_tag * vad_handle_t;

-
 /**
 * @brief Creates an instance to the VAD structure.
 *
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
 *         - NULL: Create failed
 *         - Others: The instance of VAD
 */
-vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);

 /**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -138,6 +137,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz,
 */
 vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);

+/**
+ * @brief Reset trigger state as Silence
+ *
+ * @param handle            The instance of VAD.
+ */
+void vad_reset_trigger(vad_handle_t handle);
+
 /**
 * @brief Free the VAD instance
 *
@ -149,20 +155,21 @@ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
 void vad_destroy(vad_handle_t inst);

 /*
-* Programming Guide:
-*
-* @code{c}
-* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);     // Creates an instance to the VAD structure.
-*
-* while (1) {
-*    //Use buffer to receive the audio data from MIC.
-*    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
-* }
-*
-* vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
-*
-* @endcode
-*/
+ * Programming Guide:
+ *
+ * @code{c}
+ * // Creates an instance to the VAD structure.
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */

 #ifdef __cplusplus
 }
--- a/include/esp32c5/esp_afe_aec.h
+++ b/include/esp32c5/esp_afe_aec.h
@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance of the AEC structure.
+ *
+ * @warning Currently only 1 microphone channel and 1 playback channel are supported.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
--- a/include/esp32c5/esp_afe_config.h
+++ b/include/esp32c5/esp_afe_config.h
@ -0,0 +1,69 @@
+#pragma once
+#include "esp_aec.h"
+#include "stdbool.h"
+#include "stdint.h"
+#include "stdlib.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// AFE: Audio Front-End
+// SR:  Speech Recognition
+// VC:  Voice Communication
+
+// Set AFE_SR mode
+typedef enum {
+    SR_MODE_LOW_COST = 0,  // Deprecated, please use afe_mode_t, AFE mode: low cost mode
+    SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
+} afe_sr_mode_t;
+
+// Set AFE mode
+typedef enum {
+    AFE_MODE_LOW_COST = 0,  // AFE mode: low cost mode
+    AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
+} afe_mode_t;
+
+// Set AFE type
+typedef enum {
+    AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
+    AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
+} afe_type_t;
+
+typedef enum {
+    AFE_MEMORY_ALLOC_MORE_INTERNAL = 1,          // malloc with more internal ram
+    AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
+    AFE_MEMORY_ALLOC_MORE_PSRAM = 3              // malloc with more psram
+} afe_memory_alloc_mode_t;
+
+typedef enum {
+    AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
+    AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
+    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
+    AFE_MN_PEAK_NO_AGC = 0,      // There is no agc gain
+} afe_mn_peak_agc_mode_t;
+
+typedef struct {
+    int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
+    int mic_num;      // microphone channel number
+    uint8_t *mic_ids; // microphone channel indices
+    int ref_num;      // playback reference channel number
+    uint8_t *ref_ids; // playback reference channel indices
+    int sample_rate;  // sample rate of audio
+} afe_pcm_config_t;
+
+typedef enum {
+    AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
+    AFE_NS_MODE_NET = 1,    // please use model name of NSNET
+} afe_ns_mode_t;
+
+typedef enum {
+    AFE_AGC_MODE_WEBRTC = 0,  // WEBRTC AGC
+    AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
+} afe_agc_mode_t;
+
+
+#ifdef __cplusplus
+}
+#endif
+
--- a/include/esp32p4/esp_afe_aec.h
+++ b/include/esp32p4/esp_afe_aec.h
@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance of the AEC structure.
+ *
+ * @warning Currently only 1 microphone channel and 1 playback channel are supported.
+ * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data, format is defined by input_format. Note indata will be modified in function call.
+ * @param outdata     Returns near-end signal with echo removed.
+ *
+ * @return The bytes of outdata.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
--- a/include/esp32p4/esp_mfcc_iface.h
+++ b/include/esp32p4/esp_mfcc_iface.h
@ -1,6 +1,6 @@
 #pragma once
-#include <stdint.h>
 #include "esp_speech_features.h"
+#include <stdint.h>

 /*
 This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
 multiple implementations can be used.
 */

-
 typedef struct esp_mfcc_data_t esp_mfcc_data_t;

-
-//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
-//refer to its documentation for details.
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
 typedef struct {
-    int winstep_ms;     // The step between successive windows in ms. (10)
-    int winlen_ms;      // The length of the analysis window in ms. (25)
-    int nch;            // The number of input channel
-    int numcep;         // The number of cepstrum to return
-    int nfilter;        // The number of filters in the filterbank
-    int nfft;           // The FFT size
-    int samp_freq;      // The sample-rate of the signal.
-    int low_freq;       // The lowest band edge of mel filters, in hz. (e.g. 0)
-    int high_freq;      // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
-    float preemph;      // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
-    char *win_type;     // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
-    bool append_energy; //　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
+    bool append_energy; // 　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
-    float log_epsilon;  // log epsilon. (e.g. 1e-7) 
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
    bool psram_first;   // Alloc memory from PSRAM first
-    bool remove_dc_offset;  // Whether to subtract mean of wave before FFT
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
 } esp_mfcc_opts_t;

-
 /**
 * @brief Un-initialize and free a mfcc runner
 *
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
 * @param opt Options for the mfcc process
 * @return True if success, false on error.
 */
-typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);

 /**
 * @brief Run a mfcc iteration on frame by frame
 *
 * This will take a set of samples and return a ceptrum. Note that this may be pipelined:
- * an initial call to this function may return NULL and subsequent calls may return the 
+ * an initial call to this function may return NULL and subsequent calls may return the
 * cepstrum of previous calls.
 *
 * @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
 *         when done with this buffer. Note that some implementations require the buffer to be freed before another call
 *         to this function is done.
 */
-typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);

 /**
 * @brief Clean all state of mfcc handle
--- a/include/esp32p4/esp_mfcc_models.h
+++ b/include/esp32p4/esp_mfcc_models.h
@ -1,18 +1,16 @@
 #pragma once
 #include "esp_mfcc_iface.h"

-
 extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle

-
 /**
 * @brief Return basic opts used in wakenet9 & multinet5
 **/
 esp_mfcc_opts_t *get_mfcc_opts_wn9();

 /**
- * @brief Return basic opts for default kaldifeat 
- * 
+ * @brief Return basic opts for default kaldifeat
+ *
    opts->psram_first = true;
    opts->use_power = true;
    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
 /**
 * @brief Print mfcc opts
 **/
-void print_mfcc_opts(esp_mfcc_opts_t *opts);
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
--- a/include/esp32p4/esp_speech_features.h
+++ b/include/esp32p4/esp_speech_features.h
@ -8,46 +8,45 @@
 #define M_2PI 6.283185307179586476925286766559005
 #endif

-typedef struct 
-{
+typedef struct {
    float *coeff;
    int *bank_pos;
    int nfilter;
 } esp_mel_filter_t;

-float* esp_mfcc_malloc(size_t size, bool from_psram);
+float *esp_mfcc_malloc(size_t size, bool from_psram);

 void esp_mfcc_free(void *ptr);

 /**
 * @brief Initialize FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
- * @param nfft  The input samples number 
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
 * @return fft-table
 **/
-void* esp_fft_init(int nfft);
+void *esp_fft_init(int nfft);

 /**
 * @brief Free FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
+ *          For Other platform, use kiss fft
+ *
 * @param fft_table  The fft table initialized by esp_fft_init
- * @param nfft       The input samples number 
+ * @param nfft       The input samples number
 * @return fft-table
 **/
 void esp_fft_deinit(void *fft_table, int nfft);

 /**
 * @brief Initial window function
- *        Currently support hanning, hamming, sine, povey, rectangular, 
+ *        Currently support hanning, hamming, sine, povey, rectangular,
 *        wn9(512-hanning to get wakenet9& multinet5 compatible)
 **/
-float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

-float* esp_fftr(float* x, int nfft, void *fft_table);
+float *esp_fftr(float *x, int nfft, void *fft_table);

 float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);

@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

 float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);

-esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, 
-                                      bool from_psram);
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);

 void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);

-float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, 
-                                float epsilon);
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
--- a/include/esp32p4/esp_vad.h
+++ b/include/esp32p4/esp_vad.h
@ -20,19 +20,19 @@
 extern "C" {
 #endif

-#define SAMPLE_RATE_HZ 16000      //Supports 32000, 16000, 8000
-#define VAD_FRAME_LENGTH_MS 30    //Supports 10ms, 20ms, 30ms
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms

 /**
 * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
 * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
 */
 typedef enum {
-    VAD_MODE_0 = 0,  // Normal
-    VAD_MODE_1,      // Aggressive
-    VAD_MODE_2,      // Very Aggressive
-    VAD_MODE_3,      // Very Very Aggressive
-    VAD_MODE_4       // Very Very Very Aggressive
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
 } vad_mode_t;

 typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
 #define vad_MAX_LEN INT32_MAX - 1
 /**
 * @brief Allocate wakenet trigger
- * 
+ *
 * @param min_speech_len  Minimum frame number of speech duration
 * @param min_noise_len   Minimum frame number of noise duration
- * 
+ *
 * @return Trigger pointer
 **/
 vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
 **/
 vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);

-
 typedef struct {
    vad_trigger_t *trigger;
    void *vad_inst;
    int sample_rate;
    int frame_size;
-}vad_handle_with_trigger_t;
+} vad_handle_with_trigger_t;

-typedef vad_handle_with_trigger_t* vad_handle_t;
+typedef vad_handle_with_trigger_t *vad_handle_t;

 // typedef vad_handle_tag * vad_handle_t;

-
 /**
 * @brief Creates an instance to the VAD structure.
 *
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
 *         - NULL: Create failed
 *         - Others: The instance of VAD
 */
-vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);

 /**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle);
 void vad_destroy(vad_handle_t inst);

 /*
-* Programming Guide:
-*
-* @code{c}
-* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);     // Creates an instance to the VAD structure.
-*
-* while (1) {
-*    //Use buffer to receive the audio data from MIC.
-*    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
-* }
-*
-* vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
-*
-* @endcode
-*/
+ * Programming Guide:
+ *
+ * @code{c}
+ * // Creates an instance to the VAD structure.
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */

 #ifdef __cplusplus
 }
--- a/include/esp32s3/esp_afe_aec.h
+++ b/include/esp32s3/esp_afe_aec.h
@ -0,0 +1,82 @@
+
+#ifndef _ESP_AFE_AEC_H_
+#define _ESP_AFE_AEC_H_
+
+
+#include "esp_afe_config.h"
+#include "esp_aec.h"
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+    aec_handle_t* handle;
+    aec_mode_t mode;
+    afe_pcm_config_t pcm_config;
+    int frame_size;
+    int16_t  *data;
+}afe_aec_handle_t;
+
+
+/**
+ * @brief Creates an instance to the AEC structure. 
+ * 
+ * @warning Currently supports only 1 microphone channel and 1 playback channel.
+ * If the input has multiple microphone or playback channels, only the first microphone channel and the first playback channel are used.
+ *
+ * The input format, same as afe config:
+ * M to represent the microphone channel
+ * R to represent the playback reference channel
+ * N to represent an unknown or unused channel
+ *
+ * For example, input_format="MMNR" indicates that the input data consists of four channels,
+ * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
+ *
+ * @param input_format     The input format
+ * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
+ *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
+ * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
+ * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
+ *
+ * @return afe_aec_handle_t*  The created AEC instance
+ */
+afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param handel      The instance of AEC.
+ * @param indata      Input audio data; its format is defined by input_format. Note that indata is modified by this call.
+ * @param outdata     Returns the near-end signal with echo removed.
+ *
+ * @return The size of outdata in bytes.
+ */
+size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Get frame size of AEC (the samples of one frame)
+ * @param handle The instance of AEC.
+ * @return Frame size
+ */
+int afe_aec_get_chunksize(afe_aec_handle_t *handle);
+
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param handel The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void afe_aec_destroy(afe_aec_handle_t *handel);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif //_ESP_AFE_AEC_H_
--- a/include/esp32s3/esp_mfcc_iface.h
+++ b/include/esp32s3/esp_mfcc_iface.h
@ -1,6 +1,6 @@
 #pragma once
-#include <stdint.h>
 #include "esp_speech_features.h"
+#include <stdint.h>

 /*
 This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
 multiple implementations can be used.
 */

-
 typedef struct esp_mfcc_data_t esp_mfcc_data_t;

-
-//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
-//refer to its documentation for details.
+// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
+// please refer to its documentation for details.
 typedef struct {
-    int winstep_ms;     // The step between successive windows in ms. (10)
-    int winlen_ms;      // The length of the analysis window in ms. (25)
-    int nch;            // The number of input channel
-    int numcep;         // The number of cepstrum to return
-    int nfilter;        // The number of filters in the filterbank
-    int nfft;           // The FFT size
-    int samp_freq;      // The sample-rate of the signal.
-    int low_freq;       // The lowest band edge of mel filters, in hz. (e.g. 0)
-    int high_freq;      // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
-    float preemph;      // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
-    char *win_type;     // Analysis window type to apply to each frame， "hanning","hamming","sine","rectangular","povey"
-    bool append_energy; //　If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
+    int winstep_ms; // The step between successive windows in ms. (10)
+    int winlen_ms;  // The length of the analysis window in ms. (25)
+    int nch;        // The number of input channel
+    int numcep;     // The number of cepstrum to return
+    int nfilter;    // The number of filters in the filterbank
+    int nfft;       // The FFT size
+    int samp_freq;  // The sample-rate of the signal.
+    int low_freq;   // The lowest band edge of mel filters, in hz. (e.g. 0)
+    int high_freq;  // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
+    float preemph;  // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
+    char *win_type; // Analysis window type to apply to each frame: "hanning", "hamming", "sine", "rectangular", "povey"
+    bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
    bool use_power;     // If true, use power of fft spectrum, else use magnitude of fft spectrum
    int use_log_fbank;  // 0: return fbank, 1:  return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
-    float log_epsilon;  // log epsilon. (e.g. 1e-7) 
+    float log_epsilon;  // log epsilon. (e.g. 1e-7)
    bool psram_first;   // Alloc memory from PSRAM first
-    bool remove_dc_offset;  // Whether to subtract mean of wave before FFT
+    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
 } esp_mfcc_opts_t;

-
 /**
 * @brief Un-initialize and free a mfcc runner
 *
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
 * @param opt Options for the mfcc process
 * @return True if success, false on error.
 */
-typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
+typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);

 /**
 * @brief Run a mfcc iteration on frame by frame
 *
 * This will take a set of samples and return a ceptrum. Note that this may be pipelined:
- * an initial call to this function may return NULL and subsequent calls may return the 
+ * an initial call to this function may return NULL and subsequent calls may return the
 * cepstrum of previous calls.
 *
 * @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
 *         when done with this buffer. Note that some implementations require the buffer to be freed before another call
 *         to this function is done.
 */
-typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
+typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);

 /**
 * @brief Clean all state of mfcc handle
--- a/include/esp32s3/esp_mfcc_models.h
+++ b/include/esp32s3/esp_mfcc_models.h
@ -1,18 +1,16 @@
 #pragma once
 #include "esp_mfcc_iface.h"

-
 extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle

-
 /**
 * @brief Return basic opts used in wakenet9 & multinet5
 **/
 esp_mfcc_opts_t *get_mfcc_opts_wn9();

 /**
- * @brief Return basic opts for default kaldifeat 
- * 
+ * @brief Return basic opts for default kaldifeat
+ *
    opts->psram_first = true;
    opts->use_power = true;
    opts->use_log_fbank = 2;  // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
 /**
 * @brief Print mfcc opts
 **/
-void print_mfcc_opts(esp_mfcc_opts_t *opts);
+void print_mfcc_opts(esp_mfcc_opts_t *opts);
--- a/include/esp32s3/esp_speech_features.h
+++ b/include/esp32s3/esp_speech_features.h
@ -8,46 +8,45 @@
 #define M_2PI 6.283185307179586476925286766559005
 #endif

-typedef struct 
-{
+typedef struct {
    float *coeff;
    int *bank_pos;
    int nfilter;
 } esp_mel_filter_t;

-float* esp_mfcc_malloc(size_t size, bool from_psram);
+float *esp_mfcc_malloc(size_t size, bool from_psram);

 void esp_mfcc_free(void *ptr);

 /**
 * @brief Initialize FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
- * @param nfft  The input samples number 
+ *          For Other platform, use kiss fft
+ *
+ * @param nfft  The input samples number
 * @return fft-table
 **/
-void* esp_fft_init(int nfft);
+void *esp_fft_init(int nfft);

 /**
 * @brief Free FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
- *          For Other platform, use kiss fft  
- * 
+ *          For Other platform, use kiss fft
+ *
 * @param fft_table  The fft table initialized by esp_fft_init
- * @param nfft       The input samples number 
+ * @param nfft       The input samples number
 * @return fft-table
 **/
 void esp_fft_deinit(void *fft_table, int nfft);

 /**
 * @brief Initial window function
- *        Currently support hanning, hamming, sine, povey, rectangular, 
+ *        Currently support hanning, hamming, sine, povey, rectangular,
 *        wn9(512-hanning to get wakenet9& multinet5 compatible)
 **/
-float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
+float *esp_win_func_init(char *win_type, float *window_data, int frame_length);

-float* esp_fftr(float* x, int nfft, void *fft_table);
+float *esp_fftr(float *x, int nfft, void *fft_table);

 float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);

@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);

 float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);

-esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, 
-                                      bool from_psram);
+esp_mel_filter_t *esp_mel_filter_init(
+    int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);

 void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);

-float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, 
-                                float epsilon);
+float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);
--- a/include/esp32s3/esp_vad.h
+++ b/include/esp32s3/esp_vad.h
@ -20,19 +20,19 @@
 extern "C" {
 #endif

-#define SAMPLE_RATE_HZ 16000      //Supports 32000, 16000, 8000
-#define VAD_FRAME_LENGTH_MS 30    //Supports 10ms, 20ms, 30ms
+#define SAMPLE_RATE_HZ 16000   // Supports 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms

 /**
 * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
 * restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
 */
 typedef enum {
-    VAD_MODE_0 = 0,  // Normal
-    VAD_MODE_1,      // Aggressive
-    VAD_MODE_2,      // Very Aggressive
-    VAD_MODE_3,      // Very Very Aggressive
-    VAD_MODE_4       // Very Very Very Aggressive
+    VAD_MODE_0 = 0, // Normal
+    VAD_MODE_1,     // Aggressive
+    VAD_MODE_2,     // Very Aggressive
+    VAD_MODE_3,     // Very Very Aggressive
+    VAD_MODE_4      // Very Very Very Aggressive
 } vad_mode_t;

 typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
 #define vad_MAX_LEN INT32_MAX - 1
 /**
 * @brief Allocate wakenet trigger
- * 
+ *
 * @param min_speech_len  Minimum frame number of speech duration
 * @param min_noise_len   Minimum frame number of noise duration
- * 
+ *
 * @return Trigger pointer
 **/
 vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
 **/
 vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);

-
 typedef struct {
    vad_trigger_t *trigger;
    void *vad_inst;
    int sample_rate;
    int frame_size;
-}vad_handle_with_trigger_t;
+} vad_handle_with_trigger_t;

-typedef vad_handle_with_trigger_t* vad_handle_t;
+typedef vad_handle_with_trigger_t *vad_handle_t;

 // typedef vad_handle_tag * vad_handle_t;

-
 /**
 * @brief Creates an instance to the VAD structure.
 *
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
 *         - NULL: Create failed
 *         - Others: The instance of VAD
 */
-vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
+vad_handle_t vad_create_with_param(
+    vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);

 /**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle);
 void vad_destroy(vad_handle_t inst);

 /*
-* Programming Guide:
-*
-* @code{c}
-* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);     // Creates an instance to the VAD structure.
-*
-* while (1) {
-*    //Use buffer to receive the audio data from MIC.
-*    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
-* }
-*
-* vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
-*
-* @endcode
-*/
+ * Programming Guide:
+ *
+ * @code{c}
+ * // Creates an instance to the VAD structure.
+ * vad_handle_t vad_inst = vad_create(VAD_MODE_3);
+ *
+ * while (1) {
+ *    //Use buffer to receive the audio data from MIC.
+ *    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD process and get the result.
+ * }
+ *
+ * vad_destroy(vad_inst);   // Free the VAD instance at the end of whole VAD process
+ *
+ * @endcode
+ */

 #ifdef __cplusplus
 }
--- a/lib/esp32/libc_speech_features.a
+++ b/lib/esp32/libc_speech_features.a
--- a/lib/esp32/libesp_audio_front_end.a
+++ b/lib/esp32/libesp_audio_front_end.a
--- a/lib/esp32/libesp_audio_processor.a
+++ b/lib/esp32/libesp_audio_processor.a
--- a/lib/esp32/libmultinet.a
+++ b/lib/esp32/libmultinet.a
--- a/lib/esp32/libwakenet.a
+++ b/lib/esp32/libwakenet.a
--- a/lib/esp32c5/libesp_audio_front_end.a
+++ b/lib/esp32c5/libesp_audio_front_end.a
--- a/lib/esp32p4/libc_speech_features.a
+++ b/lib/esp32p4/libc_speech_features.a
--- a/lib/esp32p4/libesp_audio_front_end.a
+++ b/lib/esp32p4/libesp_audio_front_end.a
--- a/lib/esp32p4/libesp_audio_processor.a
+++ b/lib/esp32p4/libesp_audio_processor.a
--- a/lib/esp32p4/libmultinet.a
+++ b/lib/esp32p4/libmultinet.a
--- a/lib/esp32p4/libvadnet.a
+++ b/lib/esp32p4/libvadnet.a
--- a/lib/esp32p4/libwakenet.a
+++ b/lib/esp32p4/libwakenet.a
--- a/lib/esp32s3/libc_speech_features.a
+++ b/lib/esp32s3/libc_speech_features.a
--- a/lib/esp32s3/libdl_lib.a
+++ b/lib/esp32s3/libdl_lib.a
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libesp_audio_processor.a
+++ b/lib/esp32s3/libesp_audio_processor.a
--- a/lib/esp32s3/libflite_g2p.a
+++ b/lib/esp32s3/libflite_g2p.a
--- a/lib/esp32s3/libfst.a
+++ b/lib/esp32s3/libfst.a
--- a/lib/esp32s3/libhufzip.a
+++ b/lib/esp32s3/libhufzip.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libnsnet.a
+++ b/lib/esp32s3/libnsnet.a
--- a/lib/esp32s3/libvadnet.a
+++ b/lib/esp32s3/libvadnet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/test_apps/esp-sr/main/test_afe.cpp
+++ b/test_apps/esp-sr/main/test_afe.cpp
@ -18,6 +18,7 @@
 #include "esp_wn_models.h"
 #include "esp_afe_sr_models.h"
 #include "dl_lib_convq_queue.h"
+#include "esp_afe_aec.h"
 #include <sys/time.h>

 #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
@ -297,4 +298,25 @@ TEST_CASE("afe performance test (2ch)", "[afe_perf]")
        afe_config_free(afe_config);
    }
    esp_srmodel_deinit(models);
+}
+
+
+TEST_CASE("test afe aec interface", "[afe]")
+{
+    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
+
+    afe_aec_handle_t *handle = afe_aec_create("MNR", 4, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+    int frame_bytes = handle->frame_size * sizeof(int16_t);
+    int16_t *indata = (int16_t *) malloc(frame_bytes*handle->pcm_config.total_ch_num);
+    int16_t *outdata = (int16_t *) malloc(frame_bytes);
+
+    afe_aec_process(handle, indata, outdata);
+    afe_aec_process(handle, indata, outdata);
+    afe_aec_process(handle, indata, outdata);
+
+    afe_aec_destroy(handle);
+    free(indata);
+    free(outdata);
+    int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
+    TEST_ASSERT_EQUAL(true, end_size == start_size);
 }
--- a/test_apps/esp32c5/main/test_aec.cpp
+++ b/test_apps/esp32c5/main/test_aec.cpp
@ -12,10 +12,64 @@
 #include "freertos/FreeRTOS.h"
 #include "freertos/task.h"
 #include "esp_aec.h"
+#include "esp_afe_aec.h"
 #include "audio_test_file.h"
 #include "unity.h"
 #include "esp_timer.h"

+
+TEST_CASE("test esp32c5 afe aec interface", "[aec]")
+{
+    // Record heap usage up front so leaks from the AEC create/destroy cycle can be detected.
+    heap_caps_print_heap_info(MALLOC_CAP_8BIT);
+    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
+    int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
+    int sample_rate = 16000;
+
+    afe_aec_handle_t *aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST);
+    afe_aec_destroy(aec_handle);
+    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
+    printf("memory leak for first init: %d\n", start_size - first_end_size);
+
+    aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST);
+    int audio_chunksize = afe_aec_get_chunksize(aec_handle);
+    printf("audio chunksize:%d\n", audio_chunksize); //512
+    int16_t *buffer = (int16_t *)malloc(audio_chunksize * sizeof(int16_t)*2);
+    int16_t *out_buffer = (int16_t *)malloc(audio_chunksize * sizeof(int16_t));
+
+    int chunks = 0;
+    uint32_t c0, c1, c_res = 0;
+    while (1) {
+        if ((chunks + 1)*audio_chunksize * sizeof(int16_t) <= sizeof(audio_mic_file)) {
+            memcpy(buffer, audio_mic_file + chunks * audio_chunksize , audio_chunksize * sizeof(int16_t));
+            memcpy(buffer+audio_chunksize, audio_ref_file + chunks * audio_chunksize  , audio_chunksize * sizeof(int16_t));            
+        } else {
+            break;
+        }
+        
+        c0 = esp_timer_get_time();
+        afe_aec_process(aec_handle, buffer, out_buffer);
+        c1 = esp_timer_get_time();
+
+        c_res += c1 - c0;
+        chunks++;
+    }
+
+    free(buffer);
+    free(out_buffer);
+    printf("RAM size after vad detection: total:%d, internal:%d\n", 
+            start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT), 
+            start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL));
+    printf("Done! Took %ld ms to parse %d ms worth of samples in %d iterations.\n", 
+            c_res/1000, chunks*audio_chunksize*1000/sample_rate, chunks);
+    afe_aec_destroy(aec_handle);
+
+    int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
+    printf("memory leak:%d\n", start_size-end_size);
+    TEST_ASSERT_EQUAL(true, end_size == start_size);
+}
+
+
 TEST_CASE("test esp32c5 aec", "[aec]")
 {
    // vad_handle_t vad_handle = (vad_handle_t)arg;
@ -68,3 +122,6 @@ TEST_CASE("test esp32c5 aec", "[aec]")
    printf("memory leak:%d\n", start_size-end_size);
    TEST_ASSERT_EQUAL(true, end_size == start_size);
 }
+
+
+