Merge branch 'feat/add_mfcc_interface' into 'master'

feat: add mfcc interface

See merge request speech-recognition-framework/esp-sr!134
This commit is contained in:
Sun Xiang Yu 2025-02-07 11:37:41 +08:00
commit 2993ce18ce
41 changed files with 1234 additions and 498 deletions

View File

@ -0,0 +1,29 @@
#pragma once
// Precision configuration for c_speech_features: the csf_* aliases below map
// onto either the double- or single-precision <math.h> functions depending on
// whether ENABLE_DOUBLE is defined at configure time.
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
// Double-precision build: csf_float is double and the csf_* helpers resolve
// to the double-precision <math.h> functions.
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
// Single-precision build (default): csf_float is float and the csf_* helpers
// resolve to the float variants of the <math.h> functions.
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that
* avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
* @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
// start from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
* @brief Function to initialize an AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* when wakenet has been initialized. It's only support wakenet 1 now.
*
* @param afe The AFE_SR object to query
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
// Opaque runner state; the concrete layout is private to each implementation.
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc
// (from c_speech_features), please refer to its documentation for details.
typedef struct {
    int winstep_ms;        // The step between successive windows in ms. (10)
    int winlen_ms;         // The length of the analysis window in ms. (25)
    int nch;               // The number of input channels
    int numcep;            // The number of cepstra to return
    int nfilter;           // The number of filters in the filterbank
    int nfft;              // The FFT size
    int samp_freq;         // The sample-rate of the signal.
    int low_freq;          // The lowest band edge of mel filters, in hz. (e.g. 0)
    int high_freq;         // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
    float preemph;         // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
    char *win_type;        // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
    bool append_energy;    // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
    bool use_power;        // If true, use power of fft spectrum, else use magnitude of fft spectrum
    int use_log_fbank;     // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
    float log_epsilon;     // log epsilon. (e.g. 1e-7)
    bool psram_first;      // Alloc memory from PSRAM first
    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
 * @brief Un-initialize and free a mfcc runner
 *
 * Function to free a previously allocated mfcc runner.
 *
 * @param r Runner object to destroy
 */
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
 * @brief Create and initialize a mfcc runner.
 *
 * After creation, a mfcc runner needs to be initialized first; this is usually done
 * in the initialization routine of a speech recognition algorithm. This provides
 * a pointer to do this for a specific mfcc runner.
 *
 * @param opt Options for the mfcc process
 * @return Pointer to the new runner object, or NULL on error.
 */
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
 * @brief Run a mfcc iteration on frame by frame
 *
 * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
 * an initial call to this function may return NULL and subsequent calls may return the
 * cepstrum of previous calls.
 *
 * @param r The mfcc runner
 * @param samp An array of signed 16-bit samples. The amount of samples should be sampfreq/(winstep_ms/1000).
 * @param nch The number of channels in samp
 * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
 * when done with this buffer. Note that some implementations require the buffer to be freed before another call
 * to this function is done.
 * NOTE(review): no free_cepbuf op is declared in esp_mfcc_iface_t below — confirm
 * who owns/frees the returned buffer.
 */
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
 * @brief Clean all state of mfcc handle
 *
 * @param r The mfcc runner
 */
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
 * @brief Operations possible on a mfcc runner
 */
typedef struct {
    esp_mfcc_op_destroy_t destroy;   // free the runner
    esp_mfcc_op_create_t create;     // allocate and configure a runner
    esp_mfcc_op_run_step_t run_step; // feed one frame, fetch cepstra
    esp_mfcc_op_clean_t clean;       // reset internal state
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"

// C linkage guard, matching the convention of the sibling ESP-SR headers
// (e.g. esp_afe_config.h), so these symbols link correctly from C++.
#ifdef __cplusplus
extern "C" {
#endif

extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle

/**
 * @brief Return basic opts used in wakenet9 & multinet5
 *
 * @return Pointer to the option set.
 *         NOTE(review): ownership is not documented — confirm whether the
 *         caller must free the returned opts.
 **/
esp_mfcc_opts_t *get_mfcc_opts_wn9(void);

/**
 * @brief Return basic opts for default kaldifeat
 *
 opts->psram_first = true;
 opts->use_power = true;
 opts->use_log_fbank = 2; // log(max(x, log_epsilon))
 opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
 opts->win_type = "povey";
 opts->low_freq = 20;
 opts->high_freq = 7600;
 opts->samp_freq = 16000;
 opts->nch = 1;
 opts->nfft = 512;
 opts->nfilter = 80;
 opts->numcep = 80;
 opts->preemph = 0.97;
 opts->append_energy = false;
 opts->winlen_ms = 25;
 opts->winstep_ms = 10;
 opts->remove_dc_offset = true;
 *
 * @return Pointer to the option set (see defaults listed above).
 **/
esp_mfcc_opts_t *get_mfcc_opts_kaldi(void);

/**
 * @brief Print mfcc opts
 *
 * @param opts The mfcc options to print
 **/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005
#endif
typedef struct
{
float *coeff;
int *bank_pos;
int nfilter;
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
* @brief Initialize window function
* Currently supports hanning, hamming, sine, povey, rectangular,
* wn9 (512-hanning, for compatibility with wakenet9 & multinet5)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);

View File

@ -0,0 +1,29 @@
#pragma once
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to avoid blocking for too long.
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
* avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
* @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model is woken up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model is woken up. Index
// starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
* @brief Function to initialze a AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initialize wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* @brief Initialize wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* when wakenet has been initialized. It only supports wakenet 1 now.
*
* @param afe The AFE_SR object to query
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
* Function to free a previously allocated mfcc runner.
*
* @param r Runner object to destroy
*/
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
* @brief Initialize parameters for a mfcc runner.
*
* After creation, a mfcc runner needs to be initialized first; this is usually done
* in the initialization routine of a speech recognition algorithm. This provides
* a pointer to do this for a specific mfcc runner.
*
* @param opt Options for the mfcc process
* @return True if success, false on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
* @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle
*
* @param r The mfcc runner
*/
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
* @brief Operations possible on a mfcc runner
*/
typedef struct {
esp_mfcc_op_destroy_t destroy;
esp_mfcc_op_create_t create;
esp_mfcc_op_run_step_t run_step;
esp_mfcc_op_clean_t clean;
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
/**
* @brief Return basic opts used in wakenet9 & multinet5
**/
esp_mfcc_opts_t *get_mfcc_opts_wn9();
/**
* @brief Return basic opts for default kaldifeat
*
opts->psram_first = true;
opts->use_power = true;
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
opts->win_type = "povey";
opts->low_freq = 20;
opts->high_freq = 7600;
opts->samp_freq = 16000;
opts->nch = 1;
opts->nfft = 512;
opts->nfilter = 80;
opts->numcep = 80;
opts->preemph = 0.97;
opts->append_energy = false;
opts->winlen_ms = 25;
opts->winstep_ms = 10;
opts->remove_dc_offset = true;
*
**/
esp_mfcc_opts_t *get_mfcc_opts_kaldi();
/**
* @brief Print mfcc opts
**/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005
#endif
// Mel filterbank data used by esp_mel_dotprod_step (see esp_mel_filter_init).
typedef struct
{
    float *coeff;   // filter weight coefficients; exact layout defined by esp_mel_filter_init
    int *bank_pos;  // per-filter FFT-bin positions — presumably the start bin of each filter; TODO confirm
    int nfilter;    // number of mel filters in the bank
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
 * @brief Initialize the window function
* Currently support hanning, hamming, sine, povey, rectangular,
* wn9(512-hanning to get wakenet9& multinet5 compatible)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);

View File

@ -0,0 +1,29 @@
#pragma once
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
 * @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
 * avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
    afe_type_t afe_type;       // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
 * @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model triggered the
                             // wake-up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
 * @brief Function to initialize an AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
 * @brief Initialize the wakenet and wake word coefficients, or reset them
 *        when wakenet has already been initialized. Only wakenet 1 is supported now.
*
* @param afe The AFE_SR object to query
 * @param model_name The wakenet model name, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
* Function to free a previously allocated mfcc runner.
*
* @param r Runner object to destroy
*/
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
* @brief Initialize parameters for a mfcc runner.
*
* After creation, a mfcc runner needs to be initialized first; this is usually done
* in the initialization routine of a speech recognition algorithm. This provides
* a pointer to do this for a specific mfcc runner.
*
* @param opt Options for the mfcc process
 * @return Pointer to the created mfcc runner, or NULL on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a ceptrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
 * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle
*
* @param r The mfcc runner
*/
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
* @brief Operations possible on a mfcc runner
*/
typedef struct {
    esp_mfcc_op_destroy_t destroy;    // free a previously created mfcc runner
    esp_mfcc_op_create_t create;      // create a runner from an esp_mfcc_opts_t configuration
    esp_mfcc_op_run_step_t run_step;  // feed one chunk of samples, return cepstral values (may be pipelined)
    esp_mfcc_op_clean_t clean;        // reset all internal state of the runner without destroying it
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32 fbank implementation of esp_mfcc_iface_t

/**
 * @brief Return basic opts used in wakenet9 & multinet5
 *
 * The returned struct is heap-allocated; confirm whether create() takes
 * ownership or the caller must free it.
 **/
esp_mfcc_opts_t *get_mfcc_opts_wn9();

/**
 * @brief Return basic opts for default kaldifeat
 *
 * The returned options are preset as follows:
 *
 *   opts->psram_first = true;
 *   opts->use_power = true;
 *   opts->use_log_fbank = 2;                     // log(max(x, log_epsilon))
 *   opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
 *   opts->win_type = "povey";
 *   opts->low_freq = 20;
 *   opts->high_freq = 7600;
 *   opts->samp_freq = 16000;
 *   opts->nch = 1;
 *   opts->nfft = 512;
 *   opts->nfilter = 80;
 *   opts->numcep = 80;
 *   opts->preemph = 0.97;
 *   opts->append_energy = false;
 *   opts->winlen_ms = 25;
 *   opts->winstep_ms = 10;
 *   opts->remove_dc_offset = true;
 **/
esp_mfcc_opts_t *get_mfcc_opts_kaldi();

/**
 * @brief Print MFCC opts (for debugging/inspection)
 **/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005 // 2*pi, used by window-function generation
#endif

// Precomputed mel filterbank stored in a sparse per-filter form.
typedef struct
{
    float *coeff;  // Filter coefficients — presumably concatenated per filter; confirm layout
    int *bank_pos; // Per-filter start index into the FFT bins — TODO confirm
    int nfilter;   // Number of mel filters
} esp_mel_filter_t;

// Allocate a float buffer; prefers PSRAM when from_psram is true.
float* esp_mfcc_malloc(size_t size, bool from_psram);
// Free a buffer obtained from esp_mfcc_malloc().
void esp_mfcc_free(void *ptr);

/**
 * @brief Initialize FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
 *          For Other platform, use kiss fft
 *
 * @param nfft The input samples number
 * @return fft-table (opaque handle, pass to esp_fftr/esp_fft_deinit)
 **/
void* esp_fft_init(int nfft);

/**
 * @brief Free FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
 *          For Other platform, use kiss fft
 *
 * @param fft_table The fft table initialized by esp_fft_init
 * @param nfft The input samples number
 **/
void esp_fft_deinit(void *fft_table, int nfft);

/**
 * @brief Initialize window function coefficients into window_data
 *        Currently support hanning, hamming, sine, povey, rectangular,
 *        wn9 (512-hanning, for wakenet9 & multinet5 compatibility)
 **/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);

// Real-input FFT of x (length nfft) using a table from esp_fft_init().
float* esp_fftr(float* x, int nfft, void *fft_table);
// One spectrum frame: FFT then magnitude (use_power=false) or power (use_power=true) — confirm convention.
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
// Convert int16 samples to float; optionally subtract the frame mean (remove_dc).
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
// Pre-emphasis filter: y[i] = x[i] - coeff * x[i-1], with `last` as x[-1] carried between frames.
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
// Build a mel filterbank covering [low_freq, high_freq] Hz for the given FFT size/sample rate.
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
                                      bool from_psram);
// Free a filterbank created by esp_mel_filter_init().
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
// Apply the mel filterbank to spectrum x; use_log_fbank selects raw/log output (see esp_mfcc_opts_t).
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
                            float epsilon);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,7 @@ set(srcs
"test_wakenet.cpp"
"test_multinet.cpp"
"test_afe.cpp"
"test_mfcc.cpp"
)
idf_component_register(SRCS ${srcs}

View File

@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include "string.h"
#include <limits.h>
#include "unity.h"
#include "esp_log.h"
#include "esp_heap_caps.h"
#include "esp_mfcc_iface.h"
#include "esp_mfcc_models.h"
#include "alexa.h"
/**
 * @brief Build kaldi-style fbank options with a caller-chosen output dimension.
 *
 * Matches get_mfcc_opts_kaldi() defaults except use_log_fbank=0 (raw fbank,
 * no log applied) and a hanning window.
 *
 * @param dim Number of output coefficients (assigned to opts->numcep)
 * @return Newly heap-allocated options struct, or NULL on allocation failure.
 *         Ownership: presumably consumed by fbank create() — confirm; otherwise
 *         the caller must free it.
 */
esp_mfcc_opts_t *get_fbank_opts_kaldi(int dim)
{
    esp_mfcc_opts_t *opts = (esp_mfcc_opts_t*)malloc(sizeof(esp_mfcc_opts_t));
    if (opts == NULL) {
        // Avoid dereferencing NULL on OOM; caller must check the result.
        return NULL;
    }
    opts->psram_first = true;
    opts->use_power = true;
    opts->use_log_fbank = 0;                     // 0: return raw fbank (no log) — was mislabeled as log(max(x, eps))
    opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
    opts->win_type = const_cast<char*>("hanning"); // remove [-Wwrite-strings] warning
    opts->low_freq = 20;
    opts->high_freq = 7600;
    opts->samp_freq = 16000;
    opts->nch = 1;
    opts->nfft = 512;
    opts->nfilter = 80;
    opts->numcep = dim;
    opts->preemph = 0.97;
    opts->append_energy = false;
    opts->winlen_ms = 25;
    opts->winstep_ms = 10;
    opts->remove_dc_offset = true;
    return opts;
}
/**
 * @brief Fbank smoke + leak test: run one frame twice through create/run/destroy
 *        and verify heap usage is stable across runs.
 */
TEST_CASE("test speech features", "[fbank]")
{
    int16_t *buffer = (int16_t *) malloc(512 * sizeof(int16_t));
    TEST_ASSERT_NOT_NULL(buffer); // fail fast instead of memcpy into NULL
    const esp_mfcc_iface_t *fbank_handle = &esp_fbank_f32;
    float* fbank_out = NULL;
    // Baseline taken after the sample buffer is allocated, so only the
    // create/run/destroy cycle is measured below.
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // First cycle. NOTE(review): nch=0 is passed — presumably the backend's
    // mono/default convention; confirm against esp_fbank_f32.
    int out_dim = 80;
    esp_mfcc_data_t *fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim));
    memcpy(buffer, alexa, 512 * sizeof(int16_t));
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);
    // Print BEFORE destroying the runner: the output buffer is runner-managed
    // (see esp_mfcc_op_run_step_t docs), so reading it after destroy() is a
    // use-after-free. The original test printed after destroy.
    for (int i = 0; i < out_dim; i++) {
        printf("%f ", fbank_out[i]);
    }
    fbank_handle->destroy(fbank_model);
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // Second cycle: identical work; free-heap must come back to the same level.
    fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim));
    memcpy(buffer, alexa, 512 * sizeof(int16_t));
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);
    fbank_handle->destroy(fbank_model);
    int second_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    free(buffer); // was leaked in the original test

    TEST_ASSERT_EQUAL(true, start_size - first_end_size < 100); // allow small system/allocator residue
    TEST_ASSERT_EQUAL(true, first_end_size == second_end_size); // no per-cycle leak
}