feat(esp32p4): add vadnet1

2025-09-15 15:28:44 +08:00 · 2024-12-31 20:26:04 +08:00 · 2024-12-31 20:26:04 +08:00 · d72ed551cb
commit d72ed551cb
parent 404fa46e38
21 changed files with 296 additions and 208 deletions
--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@ -13,14 +13,9 @@ choice MODEL_DATA_PATH
 endchoice


-config USE_AFE
-	bool "use afe"
-	default "y"
-
 choice AFE_INTERFACE_SEL
 	prompt "Afe interface"
 	default AFE_INTERFACE_V1
-	depends on USE_AFE
 	help
 		Select the afe interface to be used.

@ -29,187 +24,60 @@ choice AFE_INTERFACE_SEL

 endchoice

-config USE_NSNET
-    bool "use nsnet"
-    default "n"
-
 choice SR_NSN_MODEL_LOAD
-    prompt "Select deep noise suppression"
-    default SR_NSN_NSNET2
-    depends on USE_NSNET
+    prompt "Select noise suppression model"
+    default SR_NSN_WEBRTC
    help
-        Select the deep noise suppression to be loaded.
+        Select the noise suppression model to be loaded.

-    config SR_NSN_NONE
-        bool "None"
+    config SR_NSN_WEBRTC
+        bool "noise suppression (WebRTC)"

-    config SR_NSN_NSNET1
-        bool "Deep noise suppression v1 (nsnet1)"
-        depends on IDF_TARGET_ESP32S3
    config SR_NSN_NSNET2
        bool "Deep noise suppression v2 (nsnet2)"
-	depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+	    depends on IDF_TARGET_ESP32S3 ||  IDF_TARGET_ESP32P4
 endchoice

-config USE_WAKENET
-    bool "use wakenet"
-    default "y"
+choice SR_VAD_MODEL_LOAD
+    prompt "Select voice activity detection"
+    default SR_VAD_WEBRTC
+    help
+        Select the voice activity detection (VAD) model to be loaded.
+
+    config SR_VAD_WEBRTC
+        bool "voice activity detection (WebRTC)"
+    
+    config SR_VADNET1_MODLE_MEDIUM
+        bool "voice activity detection (vadnet1 medium)"
+	    depends on IDF_TARGET_ESP32S3 ||  IDF_TARGET_ESP32P4
+endchoice

 choice SR_WN_MODEL_LOAD
    prompt "Select wake words"
-    default SR_WN_WN9_HILEXIN
-    depends on USE_WAKENET
+    default SR_WN_WN5_HILEXIN
+    depends on IDF_TARGET_ESP32
    help
        Select the Wake Words to be loaded.

    config SR_WN_WN5_HILEXIN
-        bool "Hi,乐鑫 (wn5_hilexin)"
-        depends on IDF_TARGET_ESP32
+        bool "Hi,Lexin (wn5_hilexin)"

    config SR_WN_WN5X3_HILEXIN
-        bool "Hi,乐鑫 (wn5_hilexinX3)"
-        depends on IDF_TARGET_ESP32
+        bool "Hi,Lexin (wn5_hilexinX3)"

    config SR_WN_WN5_NIHAOXIAOZHI
-        bool "你好小智 (wn5_nihaoxiaozhi)"
-        depends on IDF_TARGET_ESP32
+        bool "nihaoxiaozhi (wn5_nihaoxiaozhi)"

    config SR_WN_WN5X3_NIHAOXIAOZHI
-        bool "你好小智 (wn5_nihaoxiaozhiX3)"
-        depends on IDF_TARGET_ESP32
+        bool "nihaoxiaozhi (wn5_nihaoxiaozhiX3)"

    config SR_WN_WN5X3_NIHAOXIAOXIN
-        bool "你好小鑫 (wn5_nihaoxiaoxinX3)"
-        depends on IDF_TARGET_ESP32
-
-    config SR_WN_WN8_ALEXA
-        bool "Alexa (wn8_alexa)"
-        depends on IDF_TARGET_ESP32S3
-
-    config SR_WN_WN9_HILEXIN
-        bool "Hi,乐鑫 (wn9_hilexin)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_XIAOAITONGXUE
-        bool "小爱同学 (wn9_xiaoaitongxue)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_ALEXA
-        bool "Alexa (wn9_alexa)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HIESP
-        bool "Hi,ESP (wn9_hiesp)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HIMFIVE
-        bool "Hi,M Five (wn9_himfive)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-    
-    config SR_WN_WN9_NIHAOXIAOZHI_TTS
-        bool "你好小智 (wn9_nihaoxiaozhi_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-    
-    config SR_WN_WN9_JARVIS_TTS
-        bool "Jarvis (wn9_jarvis_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-    
-    config SR_WN_WN9_COMPUTER_TTS
-        bool "computer (wn9_computer_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HEYWILLOW_TTS
-        bool "Hey,Willow (wn9_heywillow_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_SOPHIA_TTS
-        bool "Sophia (wn9_sophia_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_NIHAOXIAOXIN_TTS
-        bool "你好小鑫 (wn9_nihaoxiaoxin_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_XIAOMEITONGXUE_TTS
-        bool "小美同学 (wn9_xiaomeitongxue_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HIXIAOXING_TTS
-        bool "Hi,小星 (wn9_hixiaoxing_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_MYCROFT_TTS
-        bool "Mycroft (wn9_mycroft_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HEYPRINTER_TTS
-        bool "Hey,Printer (wn9_heyprinter_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_XIAOLONGXIAOLONG_TTS
-        bool "小龙小龙 (wn9_xiaolongxiaolong_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_MIAOMIAOTONGXUE_TTS
-        bool "喵喵同学 (wn9_miaomiaotongxue_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HIJOY_TTS
-        bool "Hi,Joy (wn9_hijoy_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HILILI_TTS
-        bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HITELLY_TTS
-        bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HEYWANDA_TTS
-        bool "Hey,Wanda (wn9_heywanda_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HIMIAOMIAO_TTS
-        bool "Hi,喵喵 (wn9_himiaomiao_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_XIAOBINXIAOBIN_TTS
-        bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_HAIXIAOWU_TTS
-        bool "Hi,小巫 (wn9_haixiaowu_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_ASTROLABE_TTS
-        bool "Astrolabe (wn9_astrolabe_tts)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_XIAOYAXIAOYA_TTS2
-        bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-    
-    config SR_WN_WN9_HIJASON_TTS2
-        bool "Hi,Jason (wn9_hijason_tts2)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_LINAIBAN_TTS2
-        bool "璃奈板 (wn9_linaiban_tts2)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_WN9_CUSTOMWORD
-        bool "customized word (wn9_customword)"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
-
-    config SR_WN_LOAD_MULIT_WORD
-        bool "Load Multiple Wake Words"
-        depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+        bool "nihaoxiaoxin (wn5_nihaoxiaoxinX3)"

 endchoice

 menu "Load Multiple Wake Words"
-    depends on SR_WN_LOAD_MULIT_WORD
+    depends on IDF_TARGET_ESP32S3 ||  IDF_TARGET_ESP32P4

    config SR_WN_WN9_HILEXIN_MULTI
    bool "Hi,乐鑫 (wn9_hilexin)"
@ -241,94 +109,90 @@ menu "Load Multiple Wake Words"

    config SR_WN_WN9_HEYWILLOW_TTS_MULTI
    bool "Hey,Willow (wn9_heywillow_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_SOPHIA_TTS_MULTI
    bool "Sophia (wn9_sophia_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_NIHAOXIAOXIN_TTS_MULTI
    bool "你好小鑫 (wn9_nihaoxiaoxin_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_XIAOMEITONGXUE_TTS_MULTI
    bool "小美同学 (wn9_xiaomeitongxue_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HEYPRINTER_TTS_MULTI
    bool "Hey,Printer (wn9_heyprinter_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_XIAOLONGXIAOLONG_TTS_MULTI
    bool "小龙小龙 (wn9_xiaolongxiaolong_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_MIAOMIAOTONGXUE_TTS_MULTI
    bool "喵喵同学 (wn9_miaomiaotongxue_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False


    config SR_WN_WN9_HEYWANDA_TTS_MULTI
    bool "Hey,Wanda (wn9_heywanda_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HIMIAOMIAO_TTS_MULTI
    bool "Hi,喵喵 (wn9_himiaomiao_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False


    config SR_WN_WN9_MYCROFT_TTS_MULTI
    bool "Mycroft (wn9_mycroft_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HIJOY_TTS_MULTI
    bool "Hi,Joy (wn9_hijoy_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HILILI_TTS_MULTI
    bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HITELLY_TTS_MULTI
    bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_XIAOBINXIAOBIN_TTS_MULTI
    bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HAIXIAOWU_TTS_MULTI
    bool "Hi,小巫 (wn9_haixiaowu_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_ASTROLABE_TTS_MULTI
    bool "Astrolabe (wn9_astrolabe_tts)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_XIAOYAXIAOYA_TTS2_MULTI
    bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_HIJASON_TTS2_MULTI
    bool "Hi,Jason (wn9_hijason_tts2)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

    config SR_WN_WN9_LINAIBAN_TTS2_MULTI
    bool "璃奈板 (wn9_linaiban_tts2)"
-    depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
+    default False

 endmenu

-config USE_MULTINET
-    bool "use multinet"
-    default "y"

 choice CHINESE_SR_MN_MODEL_SEL
    prompt "Chinese Speech Commands Model"
-    default SR_MN_CN_MULTINET6_QUANT
-    depends on USE_MULTINET
+    default SR_MN_CN_NONE
    help
-        Select the Wake Word Engine to be used.
+        Select the Chinese Speech Commands Model.

    config SR_MN_CN_NONE
        bool "None"
@ -362,9 +226,8 @@ endchoice
 choice ENGLISH_SR_MN_MODEL_SEL
    prompt "English Speech Commands Model"
    default SR_MN_EN_NONE
-    depends on USE_MULTINET
    help
-        Select the Wake Word Engine to be used.
+        Select the English Speech Commands Model.

    config SR_MN_EN_NONE
        bool "None"
--- a/include/esp32p4/esp_afe_config.h
+++ b/include/esp32p4/esp_afe_config.h
@ -92,6 +92,10 @@ typedef struct {
    char *afe_ns_model_name;
    bool fixed_first_channel;                // If true, the channel after first wake-up is fixed to raw data of microphone
                                             // otherwise, select channel number by wakenet
+    char *vad_model_name;                    // The model name of vad, support vadnet1 and vadnet1_small
+    int vad_min_speech_ms;                   // The minimum duration of speech in ms. It should be bigger than 32 ms
+    int vad_min_noise_ms;                    // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
+    bool vad_mute_playback;                  // If true, the playback will be muted for vad detection
 } afe_config_t;


@ -126,6 +130,10 @@ typedef struct {
    .afe_ns_mode = NS_MODE_SSP, \
    .afe_ns_model_name = NULL, \
    .fixed_first_channel = true, \
+    .vad_model_name = NULL, \
+    .vad_min_speech_ms = 64, \
+    .vad_min_noise_ms = 256, \
+    .vad_mute_playback = false, \
 }
 #elif CONFIG_IDF_TARGET_ESP32P4
 #define AFE_CONFIG_DEFAULT() { \
@ -158,6 +166,10 @@ typedef struct {
    .afe_ns_mode = NS_MODE_SSP, \
    .afe_ns_model_name = NULL, \
    .fixed_first_channel = true, \
+    .vad_model_name = NULL, \
+    .vad_min_speech_ms = 64, \
+    .vad_min_noise_ms = 256, \
+    .vad_mute_playback = false, \
 }
 #elif CONFIG_IDF_TARGET_ESP32S3
 #define AFE_CONFIG_DEFAULT() { \
@ -190,6 +202,10 @@ typedef struct {
    .afe_ns_mode = NS_MODE_SSP, \
    .afe_ns_model_name = NULL, \
    .fixed_first_channel = true, \
+    .vad_model_name = NULL, \
+    .vad_min_speech_ms = 64, \
+    .vad_min_noise_ms = 256, \
+    .vad_mute_playback = false, \
 }
 #endif

--- a/include/esp32p4/esp_afe_sr_iface.h
+++ b/include/esp32p4/esp_afe_sr_iface.h
@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t
 {
    int16_t *data;                          // the data of audio.
    int data_size;                          // the size of data. The unit is byte.
+    int16_t *vad_cache;                     // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
+    int vad_cache_size;                     // the size of vad_cache. The unit is byte.
    float data_volume;                      // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
                                            // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. 
    wakenet_state_t wakeup_state;           // the value is wakenet_state_t
@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t
    int wakenet_model_index;                // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
    afe_vad_state_t vad_state;              // the value is afe_vad_state_t
    int trigger_channel_id;                 // the channel index of output
-    int wake_word_length;                   // the length of wake word. It's unit is the number of samples.
+    int wake_word_length;                   // the length of wake word. The unit is the number of samples.
    int ret_value;                          // the return state of fetch function
    void* reserved;                         // reserved for future use
 } afe_fetch_result_t;
--- a/include/esp32p4/esp_afe_sr_models.h
+++ b/include/esp32p4/esp_afe_sr_models.h
@ -4,7 +4,6 @@
 extern "C" {
 #endif

-#if defined CONFIG_USE_AFE
 #include "esp_afe_sr_iface.h"


@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1;
 #endif


-#else
-
-
-#include "esp_afe_sr_iface.h"
-extern const esp_afe_sr_iface_t esp_afe_sr_v1;
-extern const esp_afe_sr_iface_t esp_afe_vc_v1;
-#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
-#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
-
-#endif
-
 #ifdef __cplusplus
 }
 #endif
--- a/include/esp32p4/esp_vad.h
+++ b/include/esp32p4/esp_vad.h
@ -25,22 +25,65 @@ extern "C" {

 /**
 * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
- * restrictive in reporting speech.
+ * restrictive in reporting speech. So if you want to trigger more speech, please select a lower mode.
 */
 typedef enum {
-    VAD_MODE_0 = 0,
-    VAD_MODE_1,
-    VAD_MODE_2,
-    VAD_MODE_3,
-    VAD_MODE_4
+    VAD_MODE_0 = 0,  // Normal
+    VAD_MODE_1,      // Aggressive
+    VAD_MODE_2,      // Very Aggressive
+    VAD_MODE_3,      // Very Very Aggressive
+    VAD_MODE_4       // Very Very Very Aggressive
 } vad_mode_t;

 typedef enum {
    VAD_SILENCE = 0,
-    VAD_SPEECH
+    VAD_SPEECH = 1,
 } vad_state_t;

-typedef void* vad_handle_t;
+typedef struct vad_trigger_tag {
+    vad_state_t state;
+    unsigned int min_speech_len;
+    unsigned int noise_len;
+    unsigned int min_noise_len;
+    unsigned int speech_len;
+} vad_trigger_t;
+
+#define vad_MAX_LEN (INT32_MAX - 1)  // parenthesized so the macro expands safely inside larger expressions
+/**
+ * @brief Allocate vad trigger
+ * 
+ * @param min_speech_len  Minimum frame number of speech duration
+ * @param min_noise_len   Minimum frame number of noise duration
+ * 
+ * @return Trigger pointer
+ **/
+vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
+
+/**
+ * @brief Free vad trigger
+ **/
+void vad_trigger_free(vad_trigger_t *trigger);
+
+/**
+ * @brief Reset vad trigger
+ **/
+void vad_trigger_reset(vad_trigger_t *trigger);
+
+/**
+ * @brief Detect voice activity by trigger
+ **/
+vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
+
+
+typedef struct {
+    vad_trigger_t *trigger;
+    void *vad_inst;
+}vad_handle_with_trigger_t;
+
+typedef vad_handle_with_trigger_t* vad_handle_t;
+
+// typedef vad_handle_tag * vad_handle_t;
+

 /**
 * @brief Creates an instance to the VAD structure.
@ -53,6 +96,18 @@ typedef void* vad_handle_t;
 */
 vad_handle_t vad_create(vad_mode_t vad_mode);

+/**
+ * @brief Creates an instance to the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ * @param min_speech_len    Minimum frame number of speech duration
+ * @param min_noise_len     Minimum frame number of noise duration
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
+
 /**
 * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
 *
--- a/include/esp32p4/esp_vadn_iface.h
+++ b/include/esp32p4/esp_vadn_iface.h
@ -0,0 +1,142 @@
+#pragma once
+#include "esp_vad.h"
+#include "stdint.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// Opaque model data container
+typedef struct model_iface_data_t model_iface_data_t;
+
+// /**
+//  * @brief The state of vad
+//  */
+// typedef enum {
+//     VAD_NOISE = -1,  // Noise
+//     VADNET_STATE_SILENCE = 0, // Silence
+//     VAD_SPEECH = 1   // Speech
+// } vad_state_t;
+
+/**
+ * @brief Easy function type to initialize a model instance with a detection mode
+ * and specified model name
+ *
+ * @param model_name  The specified model name
+ * @param mode        The voice activity detection mode
+ * @param channel_num The number of input audio channels
+ * @param min_speech_ms  The minimum duration of speech in ms to trigger vad
+ * speech
+ * @param min_noise_ms   The minimum duration of noise in ms to trigger vad
+ * noise
+ * @returns Handle to the model data
+ */
+typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
+    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
+
+/**
+ * @brief Get the amount of samples that need to be passed to the detect
+ * function
+ *
+ * Every speech recognition model processes a certain number of samples at the
+ * same time. This function can be used to query that amount. Note that the
+ * returned amount is in 16-bit samples, not in bytes.
+ *
+ * @param model The model object to query
+ * @return The amount of samples to feed the detect function
+ */
+typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the number of audio channels that need to be passed to the
+ * detect function
+ *
+ * The samples fed to the detect function are expected to contain this many
+ * channels (see the channel_num argument of the create function). Note that
+ * the returned value is a channel count, not a number of samples or bytes.
+ *
+ * @param model The model object to query
+ * @return The number of input audio channels
+ */
+typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
+
+/**
+ * @brief Get the sample rate of the samples to feed to the detect function
+ *
+ * @param model The model object to query
+ * @return The sample rate, in hz
+ */
+typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
+
+/**
+ * @brief Set the detection threshold to manually adjust the probability
+ *
+ * @param model The model object to query
+ * @param det_threshold The voice activity detection threshold, the range of
+ * det_threshold is 0.5~0.9999
+ * @return 0: setting failed, 1: setting success
+ */
+typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
+
+/**
+ * @brief Get the voice activity detection threshold
+ *
+ * @param model The model object to query
+ * @returns the detection threshold
+ */
+typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
+
+/**
+ * @brief Feed samples of an audio stream to the vad model and detect whether
+ * it is voice.
+ *
+ * @param model The model object to query
+ * @param samples An array of 16-bit signed audio samples. The array size used
+ * can be queried by the get_samp_chunksize function.
+ * @return The detected vad state as a vad_state_t value (VAD_SILENCE or
+ * VAD_SPEECH).
+ */
+typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
+
+/**
+ * @brief Get the triggered channel index. Channel index starts from zero
+ *
+ * @param model The model object to query
+ * @return The channel index
+ */
+typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
+
+/**
+ * @brief Clean all states of model
+ *
+ * @param model The model object to query
+ */
+typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
+
+/**
+ * @brief Destroy a model object
+ *
+ * @param model Model object to destroy
+ */
+typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
+
+/**
+ * This structure contains the functions used to do operations on a voice
+ * activity detection model.
+ */
+typedef struct {
+    esp_vadn_iface_op_create_t create;
+    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
+    esp_vadn_iface_op_get_channel_num_t get_channel_num;
+    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
+    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
+    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
+    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
+    esp_vadn_iface_op_detect_t detect;
+    esp_vadn_iface_op_clean_t clean;
+    esp_vadn_iface_op_destroy_t destroy;
+} esp_vadn_iface_t;
+
+#ifdef __cplusplus
+}
+#endif
--- a/include/esp32p4/esp_vadn_models.h
+++ b/include/esp32p4/esp_vadn_models.h
@ -0,0 +1,22 @@
+#pragma once
+#include "esp_vadn_iface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// The prefix of vadnet model name is used to filter all vadnet models from available models.
+#define ESP_VADNET_PREFIX "vadnet"
+
+/**
+ * @brief Get the vadnet handle from model name
+ *
+ * @param model_name   The name of model 
+ * @returns The handle of vadnet
+ */
+const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
+
+
+#ifdef __cplusplus
+}
+#endif
--- a/lib/esp32p4/libdl_lib.a
+++ b/lib/esp32p4/libdl_lib.a
--- a/lib/esp32p4/libesp_audio_front_end.a
+++ b/lib/esp32p4/libesp_audio_front_end.a
--- a/lib/esp32p4/libesp_audio_processor.a
+++ b/lib/esp32p4/libesp_audio_processor.a
--- a/lib/esp32p4/libmultinet.a
+++ b/lib/esp32p4/libmultinet.a
--- a/lib/esp32p4/libvadnet.a
+++ b/lib/esp32p4/libvadnet.a
--- a/lib/esp32p4/libwakenet.a
+++ b/lib/esp32p4/libwakenet.a
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libnsnet.a
+++ b/lib/esp32s3/libnsnet.a
--- a/lib/esp32s3/libvadnet.a
+++ b/lib/esp32s3/libvadnet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/model/vadnet_model/vadnet1_medium/_MODEL_INFO_
+++ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_
@ -1 +1 @@
-vadnet1_medium50k_Speech_5_0.849_0.573
+vadnet1_50k_Speech_3_0.5_0.1
--- a/model/vadnet_model/vadnet1_medium/vadn1_data
+++ b/model/vadnet_model/vadnet1_medium/vadn1_data
--- a/model/vadnet_model/vadnet1_medium/vadn1_index
+++ b/model/vadnet_model/vadnet1_medium/vadn1_index