Merge branch 'refactor/afe' into 'master'

refactor(esp32s3): update AFE interface

See merge request speech-recognition-framework/esp-sr!131
This commit is contained in:
Sun Xiang Yu 2025-02-05 16:52:46 +08:00
commit 3b549e2d91
76 changed files with 2232 additions and 1513 deletions

View File

@ -315,4 +315,4 @@ push_to_github:
- echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
- git remote remove github &>/dev/null || true
- git remote add github git@github.com:espressif/esp-sr.git
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"

View File

@ -1,13 +1,14 @@
if(IDF_TARGET STREQUAL "esp32")
if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32"))
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32
"esp-tts/esp_tts_chinese/include"
"include/${IDF_TARGET}"
"src/include"
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
"src/model_path.c"
"src/esp_mn_speech_commands.c"
"src/esp_process_sdkconfig.c"
)
set(requires
@ -20,244 +21,320 @@ if(IDF_TARGET STREQUAL "esp32")
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32")
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
multinet
dl_lib
c_speech_features
wakeword_model
multinet2_ch
esp_audio_processor
esp_audio_front_end
esp_tts_chinese
voice_set_xiaole
wakenet
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32s3")
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32s3
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
)
set(requires
json
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}")
add_prebuilt_library(dl_lib "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libdl_lib.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(fst "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libfst.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
hufzip
set(sr_libs
dl_lib
fst
c_speech_features
$<TARGET_FILE:${esp_dsp_lib}>
c_speech_features
esp_audio_front_end
esp_audio_processor
multinet
flite_g2p
esp_tts_chinese
voice_set_xiaole
fst
flite_g2p
hufzip
multinet
nsnet
vadnet
wakenet
"-Wl,--end-group")
wakenet)
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
if(${IDF_TARGET} STREQUAL "esp32")
add_prebuilt_library(multinet2_ch "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet2_ch.a" PRIV_REQUIRES ${COMPONENT_NAME})
list(APPEND sr_libs multinet2_ch)
endif()
elseif(${IDF_TARGET} STREQUAL "esp32p4")
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32p4
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
)
set(requires
json
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
hufzip
dl_lib
fst
c_speech_features
$<TARGET_FILE:${esp_dsp_lib}>
esp_audio_front_end
esp_audio_processor
multinet
flite_g2p
esp_tts_chinese
voice_set_xiaole
wakenet
vadnet
nsnet
${sr_libs}
"-Wl,--end-group")
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
if(CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32P4)
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
endif()
endif()
elseif(${IDF_TARGET} STREQUAL "esp32s2")
set(requires
elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
set(requires
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32c3")
set(requires
spiffs
)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}")
add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32c6")
set(requires
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
endif()
# elseif(${IDF_TARGET} STREQUAL "esp32s3")
# set(include_dirs
# src/include
# esp-tts/esp_tts_chinese/include
# include/esp32s3
# )
# set(srcs
# src/model_path.c
# src/esp_mn_speech_commands.c
# src/esp_process_sdkconfig.c
# )
# set(requires
# json
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS ${srcs}
# INCLUDE_DIRS ${include_dirs}
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3")
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3")
# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
# hufzip
# dl_lib
# fst
# c_speech_features
# $<TARGET_FILE:${esp_dsp_lib}>
# esp_audio_front_end
# esp_audio_processor
# multinet
# flite_g2p
# esp_tts_chinese
# voice_set_xiaole
# nsnet
# vadnet
# wakenet
# "-Wl,--end-group")
# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
# idf_build_get_property(build_dir BUILD_DIR)
# set(image_file ${build_dir}/srmodels/srmodels.bin)
# add_custom_command(
# OUTPUT ${image_file}
# COMMENT "Move and Pack models..."
# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
# DEPENDS ${SDKCONFIG}
# VERBATIM)
# add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
# add_dependencies(flash srmodels_bin)
# partition_table_get_partition_info(size "--partition-name model" "size")
# partition_table_get_partition_info(offset "--partition-name model" "offset")
# if("${size}" AND "${offset}")
# esptool_py_flash_to_partition(flash "model" "${image_file}")
# else()
# set(message "Failed to find model in partition table file"
# "Please add a line(Name=model, Size>recommended size in log) to the partition file.")
# endif()
# elseif(${IDF_TARGET} STREQUAL "esp32p4")
# set(include_dirs
# src/include
# esp-tts/esp_tts_chinese/include
# include/esp32p4
# )
# set(srcs
# src/model_path.c
# src/esp_mn_speech_commands.c
# src/esp_process_sdkconfig.c
# )
# set(requires
# json
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS ${srcs}
# INCLUDE_DIRS ${include_dirs}
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4")
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4")
# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
# hufzip
# dl_lib
# fst
# c_speech_features
# $<TARGET_FILE:${esp_dsp_lib}>
# esp_audio_front_end
# esp_audio_processor
# multinet
# flite_g2p
# esp_tts_chinese
# voice_set_xiaole
# wakenet
# vadnet
# nsnet
# "-Wl,--end-group")
# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
# idf_build_get_property(build_dir BUILD_DIR)
# set(image_file ${build_dir}/srmodels/srmodels.bin)
# add_custom_command(
# OUTPUT ${image_file}
# COMMENT "Move and Pack models..."
# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
# DEPENDS ${SDKCONFIG}
# VERBATIM)
# add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
# add_dependencies(flash srmodels_bin)
# partition_table_get_partition_info(size "--partition-name model" "size")
# partition_table_get_partition_info(offset "--partition-name model" "offset")
# if("${size}" AND "${offset}")
# esptool_py_flash_to_partition(flash "model" "${image_file}")
# else()
# set(message "Failed to find model in partition table file"
# "Please add a line(Name=model, Size>recommended size in log) to the partition file.")
# endif()
# elseif(${IDF_TARGET} STREQUAL "esp32s2")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# elseif(${IDF_TARGET} STREQUAL "esp32c3")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# elseif(${IDF_TARGET} STREQUAL "esp32c6")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# endif()

View File

@ -45,6 +45,23 @@ Resource Consumption
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+================+=================+
| MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
.. only:: esp32p4
+-----------------+-----------------+-----------------+-----------------+
@ -52,21 +69,33 @@ Resource Consumption
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(LOW_COST) | 152.3 KB | 8% | 32 ms |
| AEC(LOW_COST) | 152.3 KB | 6% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| AEC(HIGH_PERF) | 166 KB | 11% | 32 ms |
| BSS(LOW_COST) | 198.7 KB | 3% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(LOW_COST) | 198.7 KB | 6% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(HIGH_PERF) | 215.5 KB | 7% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
| NS | 27 KB | 3% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| MISO | 56 KB | 8% | 16 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+=================+=================+
| MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
WakeNet
-------

View File

@ -49,6 +49,22 @@ AFE
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+================+=================+
| MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
.. only:: esp32p4
+-----------------+-----------------+-----------------+-----------------+
@ -67,6 +83,22 @@ AFE
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+=================+=================+
| MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
WakeNet
-------

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
typedef enum {
AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC fro speech recognition
AEC_MODE_SR_HIGH_PERF = 1, // High Perforamce AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4, // High Perforamce AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC swich for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Concatenate input audio frames into output frames of a different frame size
*
* @warning the input data will be modified inplace.
*
 * @param in_data The input audio data
 * @param in_frame_size The frame size of the input data
 * @param channel_num The channel number of input data, which is same as output data
 * @param out_data The output audio data
 * @param out_frame_size The frame size of the output data
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, // the input frame must be 10 ms; combine this with the sample rate to derive the frame size
} ESP_AGE_ERR;
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -0,0 +1,164 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
#endif
// Opaque model data container
typedef struct model_iface_data_t model_iface_data_t;
// /**
// * @brief The state of vad
// */
// typedef enum {
// VAD_NOISE = -1, // Noise
// VADNET_STATE_SILENCE = 0, // Silence
// VAD_SPEECH = 1 // Speech
// } vad_state_t;
/**
 * @brief Easy function type to initialize a model instance with a detection mode
 * and specified model name
 *
 * @param model_name The specified model name
 * @param mode The voice activity detection mode
 * @param channel_num The number of input audio channels
 * @param min_speech_ms The minimum duration of speech in ms required to trigger the vad
 * speech state
 * @param min_noise_ms The minimum duration of noise in ms required to trigger the vad
 * noise state
 * @returns Handle to the model data
 */
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
/**
 * @brief Get the amount of samples that need to be passed to the detect
 * function
 *
 * Every voice activity detection model processes a certain number of samples at the
 * same time. This function can be used to query that amount. Note that the
 * returned amount is in 16-bit samples, not in bytes.
 *
 * @param model The model object to query
 * @return The amount of samples to feed the detect function
 */
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
/**
 * @brief Get the channel number of the samples that need to be passed to the detect
 * function
 *
 * @param model The model object to query
 * @return The number of input audio channels the model expects
 */
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
/**
 * @brief Get the sample rate of the samples to feed to the detect function
 *
 * @param model The model object to query
 * @return The sample rate, in hz
 */
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
/**
 * @brief Set the detection threshold to manually adjust the detection probability
 *
 * @param model The model object to query
 * @param det_threshold The threshold that triggers the speech state; the range of
 * det_threshold is 0.5~0.9999
 * @return 0: setting failed, 1: setting success
 */
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
/**
 * @brief Get the voice activity detection threshold
 *
 * @param model The model object to query
 * @returns the detection threshold
 */
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
/**
 * @brief Feed samples of an audio stream to the vad model and detect whether it is
 * voice.
 *
 * @param model The model object to query
 * @param samples An array of 16-bit signed audio samples. The array size used
 * can be queried by the get_samp_chunksize function.
 * @return The vad state: VAD_SILENCE if no voice, VAD_SPEECH if voice is detected
 */
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
 * @brief Feed MFCC features of an audio stream to the vad model and detect whether it is
 * voice.
 *
 * @param model The model object to query
 * @param cq A queue of 16-bit MFCC features.
 * @return The vad state: VAD_SILENCE if no voice, VAD_SPEECH if voice is detected
 */
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
 * @brief Get MFCC features of an audio stream
 *
 * @param model The model object to query
 * @return MFCC data
 */
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
 * @brief Get the triggered channel index. Channel index starts from zero
 *
 * @param model The model object to query
 * @return The index of the channel that triggered detection
 */
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
/**
 * @brief Clean all internal states of the model
 *
 * @param model The model object to query
 */
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
/**
 * @brief Destroy a model object and free its resources
 *
 * @param model Model object to destroy
 */
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
/**
 * This structure contains the functions used to do operations on a voice
 * activity detection model.
 */
typedef struct {
    esp_vadn_iface_op_create_t create;                             // create a model instance
    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;     // samples per detect() call
    esp_vadn_iface_op_get_channel_num_t get_channel_num;           // expected input channel count
    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;               // expected sample rate in Hz
    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;       // set detection threshold
    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;       // get detection threshold
    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; // triggered channel index
    esp_vadn_iface_op_detect_t detect;                             // run detection on PCM samples
    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;                   // run detection on MFCC features
    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;               // fetch MFCC features
    esp_vadn_iface_op_clean_t clean;                               // reset internal states
    esp_vadn_iface_op_destroy_t destroy;                           // free the model instance
} esp_vadn_iface_t;
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,22 @@
#pragma once
#include "esp_vadn_iface.h"
#ifdef __cplusplus
extern "C" {
#endif
// The prefix of vadnet model names; used to filter all vadnet models from the available models.
#define ESP_VADN_PREFIX "vadnet"
/**
 * @brief Get the vadnet interface handle from a model name
 *
 * @param model_name The name of the vadnet model
 * @returns The interface handle of the vadnet
 */
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
// State container for the webrtc processing chain (NS + AGC).
typedef struct {
void* ns_handle;        // handle of the noise-suppression instance
void* agc_handle;       // handle of the AGC instance
int frame_size;         // samples per processing frame
int sample_rate;        // sample rate in Hz
int16_t *buff;          // internal working buffer
int16_t *out_data;      // output buffer returned by webrtc_process
sr_ringbuf_handle_t rb; // ring buffer used to buffer input samples
}webrtc_handle_t;
/**
 * @brief Creates an instance of webrtc.
 *
 * @warning frame_length_ms supports 10 ms, 20 ms, 30 ms and 32 ms only.
 *
 * @param frame_length_ms The frame length (in ms) of the audio processing
 * @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
 * @param agc_mode The mode of AGC
 * @param agc_gain The gain of AGC. default is 9
 * @param agc_target_level The target level of AGC. default is -3 dbfs
 * @param sample_rate The sample rate of the audio.
 *
 * @return
 *     - NULL: Create failed
 *     - Others: The instance of webrtc
 */
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
 * @brief Feed samples of an audio stream to webrtc and get the audio stream
 *        back after noise suppression / gain control.
 *
 * NOTE(review): `bool` is used below but <stdbool.h> is not included in this
 * header; presumably it is pulled in via one of the included headers — verify.
 *
 * @param handle The instance of webrtc.
 * @param indata An array of 16-bit signed audio samples.
 * @param size The sample size of the output data
 * @param enable_ns Enable noise suppression
 * @param enable_agc Enable automatic gain control
 *
 * @return Data after noise suppression / gain control
 */
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
 * @brief Free the webrtc instance
 *
 * @param handle The instance of webrtc.
 *
 * @return None
 *
 */
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_WEBRTC_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
// Operating modes of AEC: SR modes feed a speech recognizer, VOIP modes target voice communication.
typedef enum {
AEC_MODE_SR_LOW_COST = 0,      // Low cost AEC for speech recognition
AEC_MODE_SR_HIGH_PERF = 1,     // High performance AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3,    // Low cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4,   // High performance AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: aggressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
// Set AFE type: selects the processing pipeline for the target scenario
typedef enum {
AFE_TYPE_SR = 0,    // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1,    // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
// AGC implementation selector for the AFE pipeline
typedef enum {
AFE_AGC_MODE_WEBRTC = 0,     // WebRTC AGC
AFE_AGC_MODE_WAKENET = 1,    // AGC gain is calculated by the wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC switch for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output (note: invalid for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
 * @brief Adjust the gain of input data
 *
 * @warning the input data will be modified in place.
 *
 * @param data The input audio data
 * @param frame_size The frame size of input, it is also the size of single channel data
 * @param factor The gain factor
 *
 * @return int16_t* The output audio data
 */
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
 * @brief Concatenate input frames into larger output frames, per channel
 *
 * NOTE(review): the original @brief said "Adjust the gain of input data",
 * which appears to be a copy-paste from afe_adjust_gain above; the name and
 * parameters indicate frame concatenation — verify against the implementation.
 *
 * @param in_data The input audio data
 * @param in_frame_size Input frame size (per channel)
 * @param channel_num The channel number of input data, which is same as output data
 * @param out_data The output audio data
 * @param out_frame_size Output frame size (per channel)
 *
 */
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, // the input frame size must be 10 ms; combine it with the sample rate to compute the frame size
} ESP_AGE_ERR;
// Operating modes for the WebRTC-based AGC (Automatic Gain Control).
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
// VAD engine instance paired with a trigger used to smooth raw VAD decisions.
typedef struct {
vad_trigger_t *trigger;   // Trigger applied on top of raw VAD states (see vad_trigger_detect)
void *vad_inst;           // Underlying VAD engine instance (opaque)
int sample_rate;          // Sample rate in Hz of the audio fed to this instance
int frame_size;           // Per-frame size — assumed to be in samples; TODO confirm (samples vs bytes)
}vad_handle_with_trigger_t;
// Public VAD handle type: pointer to the struct above.
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -1,6 +1,7 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Feed MFCC features of an audio stream to the VAD model and detect
* whether it contains voice.
*
* @param model The model object to query
* @param cq Queue of 16-bit MFCC features.
* @return VAD_SILENCE if no voice is present, VAD_SPEECH if voice is detected.
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* @brief Get the triggered channel index. Channel index starts from zero
*
@ -133,6 +153,8 @@ typedef struct {
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_vadn_iface_op_detect_t detect;
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_vadn_iface_op_clean_t clean;
esp_vadn_iface_op_destroy_t destroy;
} esp_vadn_iface_t;

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
// Aggregates the NS and AGC state used by the webrtc_* wrapper API below.
typedef struct {
void* ns_handle;            // Noise-suppression instance (opaque; see esp_ns.h)
void* agc_handle;           // AGC instance (opaque; see esp_agc.h)
int frame_size;             // Samples per processing frame — presumably derived from frame_length_ms; verify
int sample_rate;            // Sample rate of the audio in Hz
int16_t *buff;              // Internal scratch buffer — presumably staging for ring-buffer data; verify
int16_t *out_data;          // Output buffer — presumably what webrtc_process() returns; verify
sr_ringbuf_handle_t rb;     // Ring buffer used to buffer input samples
}webrtc_handle_t;
/**
* @brief Creates an instance of webrtc.
*
* @warning frame_length_ms can be 10 ms, 20 ms, 30 ms, or 32 ms.
*
* @param frame_length_ms The length of the audio processing
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
* @param agc_mode The model of AGC
* @param agc_gain The gain of AGC. default is 9
* @param agc_target_level The target level of AGC. default is -3 dbfs
* @param sample_rate The sample rate of the audio.
*
* @return
* - NULL: Create failed
* - Others: The instance of webrtc
*/
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
*
* @param handle The instance of NS.
* @param in_data An array of 16-bit signed audio samples.
* @param out_size The sample size of output data
* @param enable_ns Enable noise suppression
* @param enable_agc Enable automatic gain control
*
* @return data after noise suppression
*/
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
* @brief Free the webrtc instance
*
* @param handle The instance of webrtc.
*
* @return None
*
*/
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_WEBRTC_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed audio samples and their MFCC features to the wakenet model and
* detect whether a wake word is present.
*
* @param model The model object to query
* @param samples An array of 16-bit audio samples.
* @param cq Queue of 16-bit MFCC features.
* @return The index of the detected wake word, or 0 if no wake word is detected.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
// AEC operating modes: low-cost vs. high-performance variants for the
// speech-recognition (SR) and voice-communication (VoIP) scenarios.
// Note: value 2 is skipped in this enum — do not renumber.
typedef enum {
AEC_MODE_SR_LOW_COST = 0, // Low cost AEC for speech recognition
AEC_MODE_SR_HIGH_PERF = 1, // High performance AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3, // Low cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4, // High performance AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
// Set AFE mode (replacement for the deprecated afe_sr_mode_t values).
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
// Set AFE type: selects the target processing scenario.
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
// Selects which component computes the AGC gain inside the AFE.
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC swich for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
* @param out_data The output audio data
* @param out_frame_size Onput data frame size of input
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
} ESP_AGE_ERR;
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -1,6 +1,7 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* @brief Get the triggered channel index. Channel index starts from zero
*
@ -133,6 +153,8 @@ typedef struct {
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_vadn_iface_op_detect_t detect;
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_vadn_iface_op_clean_t clean;
esp_vadn_iface_op_destroy_t destroy;
} esp_vadn_iface_t;

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
typedef struct {
void* ns_handle;
void* agc_handle;
int frame_size;
int sample_rate;
int16_t *buff;
int16_t *out_data;
sr_ringbuf_handle_t rb;
}webrtc_handle_t;
/**
* @brief Creates an instance of webrtc.
*
* @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms.
*
* @param frame_length_ms The length of the audio processing
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
* @param agc_mode The model of AGC
* @param agc_gain The gain of AGC. default is 9
* @param agc_target_level The target level of AGC. default is -3 dbfs
* @param sample_rate The sample rate of the audio.
*
* @return
* - NULL: Create failed
* - Others: The instance of webrtc
*/
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
*
* @param handle The instance of NS.
* @param in_data An array of 16-bit signed audio samples.
* @param out_size The sample size of output data
* @param enable_ns Enable noise suppression
* @param enable_agc Enable automatic gain control
*
* @return data after noise suppression
*/
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
* @brief Free the webrtc instance
*
* @param handle The instance of webrtc.
*
* @return None
*
*/
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_NS_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/esp32/libfst.a Normal file

Binary file not shown.

BIN
lib/esp32/libhufzip.a Normal file

Binary file not shown.

Binary file not shown.

BIN
lib/esp32/libnsnet.a Normal file

Binary file not shown.

BIN
lib/esp32/libvadnet.a Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1 +1 @@
vadnet1_mediumv1_Speech_3_0.5_0.1
vadnet1_mediumv1_Speech_1_0.5_0.1

View File

@ -958,4 +958,4 @@ end:
esp_mn_commands_print();
return esp_mn_commands_update();
}
}

View File

@ -8,7 +8,7 @@ set(srcs
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS "." "samples"
REQUIRES unity esp-sr
REQUIRES unity esp-sr esp_timer
WHOLE_ARCHIVE)
target_compile_options(${COMPONENT_LIB} PRIVATE "-Wno-format")

View File

@ -12,7 +12,7 @@
#include <limits.h>
#include "unity.h"
#include "esp_log.h"
#include "esp_timer.h"
#include "model_path.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
@ -23,152 +23,187 @@
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
#include "esp_nsn_models.h"
#include "esp_nsn_iface.h"
#include "esp_vadn_models.h"
#include "esp_vadn_iface.h"
#endif
#define ARRAY_SIZE_OFFSET 8 // Increase this if audio_sys_get_real_time_stats returns ESP_ERR_INVALID_SIZE
#define AUDIO_SYS_TASKS_ELAPSED_TIME_MS 1000 // Period of stats measurement
static const char *TAG = "AFE_TEST";
static volatile int s_cpu_test_task_flag = 0;
static esp_afe_sr_data_t *afe_data = NULL;
static int total_ram_size_before = 0;
static int internal_ram_size_before = 0;
static int psram_size_before = 0;
static int detect_cnt = 0;
static int fetch_task_flag = 0;
#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS)
const static char *task_state[] = {
"Running",
"Ready",
"Blocked",
"Suspended",
"Deleted"
};
/** @brief
* "Extr": Allocated task stack from psram, "Intr": Allocated task stack from internel
*/
const static char *task_stack[] = {"Extr", "Intr"};
#endif
TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<<", "[afe_sr]")
void test_afe_by_config(afe_config_t *afe_config, int frame_num, int* memory, float* cpu, int idx)
{
int audio_chunksize = 0;
int16_t *feed_buff = NULL;
int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
int first_end_size = 0;
int end_size = 0;
int mem_leak = 0;
uint32_t feed_cpu_time = 0;
uint32_t fetch_cpu_time = 0;
uint32_t start=0, end = 0;
int loop = 3;
int feed_chunksize = 0;
int create_size = 0;
int create_internal_size = 0;
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) {
printf("aec_init: %d, se_init: %d, vad_init: %d, wakenet_init: %d\n", aec_init, se_init, vad_init, wakenet_init);
for (int i=0; i<loop; i++) {
// init config and handle
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
// afe_config_print(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
srmodel_list_t *models = esp_srmodel_init("model");
char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
char *vad_model_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
#endif
create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT);
create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE;
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.wakenet_init = wakenet_init;
afe_config.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM;
afe_config.wakenet_model_name = model_name;
afe_config.voice_communication_init = false;
afe_config.vad_model_name = vad_model_name;
if (vad_model_name) {
printf("vad_model_name:%s\n", vad_model_name);
}
// run afe feed
feed_chunksize = afe_handle->get_feed_chunksize(afe_data);
int feed_nch = afe_handle->get_feed_channel_num(afe_data);
// test model loading time
struct timeval tv_start, tv_end;
gettimeofday(&tv_start, NULL);
afe_data = afe_handle->create_from_config(&afe_config);
gettimeofday(&tv_end, NULL);
int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
printf("create latency:%d ms\n", tv_ms);
int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch);
start = esp_timer_get_time();
for (int j=0; j<frame_num; j++) {
afe_handle->feed(afe_data, feed_buff);
}
end = esp_timer_get_time();
feed_cpu_time += end - start;
// test model memory concumption
int create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT);
int create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
printf("Internal RAM: %d, PSRAM:%d\n", create_internal_size, create_size - create_internal_size);
afe_handle->destroy(afe_data);
esp_srmodel_deinit(models);
//run afe fetch
start = esp_timer_get_time();
while(1) {
afe_fetch_result_t *res = afe_handle->fetch_with_delay(afe_data, 1 / portTICK_PERIOD_MS);
if (res->ret_value != ESP_OK) {
break;
}
}
end = esp_timer_get_time();
fetch_cpu_time += end - start;
free(feed_buff);
afe_handle->destroy(afe_data);
end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
// test memory leak
int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int last_end_size = first_end_size;
int mem_leak = start_size - last_end_size;
printf("create&destroy times:%d, memory leak:%d\n", 1, mem_leak);
if (i==0) {
first_end_size = end_size;
}
mem_leak = start_size - end_size;
ESP_LOGI(TAG, "create&destroy times:%d, memory leak:%d\n", i, mem_leak);
}
uint32_t feed_data_time = loop * frame_num * feed_chunksize / 16 * 1000; // us
memory[idx*2] = create_internal_size;
memory[idx*2+1] = create_size - create_internal_size;
cpu[idx*2] = feed_cpu_time*1.0/feed_data_time;
cpu[idx*2+1] = fetch_cpu_time*1.0/feed_data_time;
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[idx*2], memory[idx*2+1], cpu[idx*2], cpu[idx*2+1]);
TEST_ASSERT_EQUAL(true, mem_leak < 1000 && end_size == first_end_size);
}
for (int i = 0; i < 6; i++) {
printf("init partition ...\n");
models = esp_srmodel_init("model");
model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
#endif
afe_config.wakenet_model_name = model_name;
afe_config.vad_model_name = vad_model_name;
TEST_CASE(">>>>>>>> AFE create/destroy API & memory leak <<<<<<<<", "[afe]")
{
const char *input_format[6] = {"MR", "MMNR"};
afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC};
afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF};
int count = 0;
int memory[512];
float cpu[512];
printf("create ...\n");
afe_data = afe_handle->create_from_config(&afe_config);
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
printf("destroy ...\n");
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int ns_init = 0; ns_init < 2; ns_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) {
printf("format: %s, type: %d, mode: %d, memory size:%d %d\n",
input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count);
afe_config_t *afe_config = afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]);
afe_config->aec_init = aec_init;
afe_config->se_init = se_init;
afe_config->ns_init = ns_init;
afe_config->vad_init = vad_init;
afe_config->wakenet_init = wakenet_init;
test_afe_by_config(afe_config, 4, memory, cpu, count);
afe_config_free(afe_config);
count++;
}
}
}
esp_srmodel_deinit(models);
vTaskDelay(100 / portTICK_PERIOD_MS);
last_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
mem_leak = start_size - last_end_size;
printf("create&destroy times:%d, memory leak:%d\n", i + 2, mem_leak);
}
TEST_ASSERT_EQUAL(true, (mem_leak) < 1000 && last_end_size == first_end_size);
}
}
}
}
for (int idx=0; idx<256; idx++) {
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[idx*2], memory[idx*2+1], cpu[idx*2], cpu[idx*2+1]);
}
printf("AFE create/destroy API & memory leak test done\n");
}
TEST_CASE(">>>>>>>> AFE default setting <<<<<<<<", "[afe_benchmark]")
{
const char *input_format[6] = {"MR", "MMNR"};
afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC};
afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF};
int count = 0;
int memory[16];
float cpu[16];
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
printf("format: %s, type: %d, mode: %d, memory size:%d %d\n",
input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count);
afe_config_t *afe_config = afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]);
test_afe_by_config(afe_config, 8, memory, cpu, count);
afe_config_free(afe_config);
count++;
}
}
}
count = 0;
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
printf("--------format: %s, type: %s, mode: %s------------\n", input_format[format_id], type_id==0? "SR": "VC", mode_id==0? "LOW_COST": "HIGH_PERF");
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[count*2], memory[count*2+1], cpu[count*2], cpu[count*2+1]);
count++;
}
}
}
printf("test done\n");
}
void test_feed_Task(void *arg)
{
int sample_per_ms = 16;
// esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg;
afe_task_into_t *afe_task_info = (afe_task_into_t *)arg;
esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle;
esp_afe_sr_data_t *afe_data = afe_task_info->afe_data;
int feed_chunksize = afe_handle->get_feed_chunksize(afe_data);
int total_nch = afe_handle->get_total_channel_num(afe_data);
int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * total_nch);
int feed_nch = afe_handle->get_feed_channel_num(afe_data);
int sample_per_ms = afe_handle->get_samp_rate(afe_data) / 1000;
int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch);
assert(i2s_buff);
ESP_LOGI(TAG, "feed task start\n");
// FILE *fp = fopen("/sdcard/out", "w");
// if (fp == NULL) printf("can not open file\n");
while (s_cpu_test_task_flag) {
// FatfsComboWrite(i2s_buff, audio_chunksize * I2S_CHANNEL_NUM * sizeof(int16_t), 1, fp);
int count = 0;
while (1) {
count ++;
afe_handle->feed(afe_data, i2s_buff);
vTaskDelay((feed_chunksize / sample_per_ms) / portTICK_PERIOD_MS);
if (count > 100) {
break;
}
}
if (i2s_buff) {
free(i2s_buff);
@ -177,346 +212,89 @@ void test_feed_Task(void *arg)
vTaskDelete(NULL);
}
void test_detect_Task(void *arg)
void test_fetch_Task(void *arg)
{
// esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg;
int fetch_chunksize = afe_handle->get_fetch_chunksize(afe_data);
int16_t *buff = (int16_t *) malloc(fetch_chunksize * sizeof(int16_t));
assert(buff);
ESP_LOGI(TAG, "------------detect start------------\n");
// FILE *fp = fopen("/sdcard/out1", "w");
// if (fp == NULL) printf("can not open file\n");
while (s_cpu_test_task_flag) {
afe_task_into_t *afe_task_info = (afe_task_into_t *)arg;
esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle;
esp_afe_sr_data_t *afe_data = afe_task_info->afe_data;
detect_cnt = 0;
fetch_task_flag = 1;
while (1) {
afe_fetch_result_t* res = afe_handle->fetch(afe_data);
if (!res || res->ret_value == ESP_FAIL) {
printf("fetch error!\n");
break;
}
if (res->wakeup_state == WAKENET_DETECTED) {
ESP_LOGI(TAG, "wakeword detected\n");
}
if (res->wakeup_state == WAKENET_CHANNEL_VERIFIED) {
ESP_LOGI(TAG, "AFE_FETCH_CHANNEL_VERIFIED\n");
detect_cnt++;
}
}
if (buff) {
free(buff);
}
// TEST_ASSERT_EQUAL(true, detect_cnt > 0);
ESP_LOGI(TAG, "detect task quit\n");
fetch_task_flag = 0;
vTaskDelete(NULL);
}
esp_err_t audio_sys_get_real_time_stats(void)
TEST_CASE("afe performance test (1ch)", "[afe_perf]")
{
#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS)
TaskStatus_t *start_array = NULL, *end_array = NULL;
UBaseType_t start_array_size, end_array_size;
uint32_t start_run_time, end_run_time;
uint32_t task_elapsed_time, percentage_time;
esp_err_t ret;
const char *input_format = "MR";
afe_type_t afe_type = AFE_TYPE_VC;
afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST};
// Allocate array to store current task states
start_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET;
start_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * start_array_size);
assert(start_array);
// Get current task states
start_array_size = uxTaskGetSystemState(start_array, start_array_size, &start_run_time);
if (start_array_size == 0) {
ESP_LOGE(TAG, "Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
vTaskDelay(pdMS_TO_TICKS(AUDIO_SYS_TASKS_ELAPSED_TIME_MS));
// Allocate array to store tasks states post delay
end_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET;
end_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * end_array_size);
assert(end_array);
// Get post delay task states
end_array_size = uxTaskGetSystemState(end_array, end_array_size, &end_run_time);
if (end_array_size == 0) {
ESP_LOGE(TAG, "Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
// Calculate total_elapsed_time in units of run time stats clock period.
uint32_t total_elapsed_time = (end_run_time - start_run_time);
if (total_elapsed_time == 0) {
ESP_LOGE(TAG, "Delay duration too short. Trying increasing AUDIO_SYS_TASKS_ELAPSED_TIME_MS");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
ESP_LOGI(TAG, "| Task | Run Time | Per | Prio | HWM | State | CoreId | Stack ");
// Match each task in start_array to those in the end_array
for (int i = 0; i < start_array_size; i++) {
for (int j = 0; j < end_array_size; j++) {
if (start_array[i].xHandle == end_array[j].xHandle) {
task_elapsed_time = end_array[j].ulRunTimeCounter - start_array[i].ulRunTimeCounter;
percentage_time = (task_elapsed_time * 100UL) / (total_elapsed_time * portNUM_PROCESSORS);
ESP_LOGI(TAG, "| %-17s | %-11d |%2d%% | %-4u | %-9u | %-7s | %-8x | %s",
start_array[i].pcTaskName, task_elapsed_time, percentage_time, start_array[i].uxCurrentPriority,
start_array[i].usStackHighWaterMark, task_state[(start_array[i].eCurrentState)],
start_array[i].xCoreID, task_stack[esp_ptr_internal(pxTaskGetStackStart(start_array[i].xHandle))]);
// Mark that task have been matched by overwriting their handles
start_array[i].xHandle = NULL;
end_array[j].xHandle = NULL;
break;
for (int mode_id=0; mode_id<2; mode_id++) {
afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]);
if (afe_config->wakenet_init && afe_config->wakenet_model_name) {
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
afe_task_into_t task_info;
task_info.afe_data = afe_data;
task_info.afe_handle = afe_handle;
task_info.feed_task = NULL;
task_info.fetch_task = NULL;
fetch_task_flag = 1;
xTaskCreatePinnedToCore(test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0);
xTaskCreatePinnedToCore(test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0);
while (fetch_task_flag) {
vTaskDelay(32 / portTICK_PERIOD_MS);
}
}
afe_config_free(afe_config);
}
// Print unmatched tasks
for (int i = 0; i < start_array_size; i++) {
if (start_array[i].xHandle != NULL) {
ESP_LOGI(TAG, "| %s | Deleted", start_array[i].pcTaskName);
}
}
for (int i = 0; i < end_array_size; i++) {
if (end_array[i].xHandle != NULL) {
ESP_LOGI(TAG, "| %s | Created", end_array[i].pcTaskName);
}
}
printf("\n");
ret = ESP_OK;
return ret;
#else
ESP_LOGW(TAG, "Please enbale `CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID` and `CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS` in menuconfig");
return ESP_FAIL;
#endif
}
void test_print_cpuloading(void *arg)
{
while (s_cpu_test_task_flag) {
audio_sys_get_real_time_stats();
int total_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int internal_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
int psram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
ESP_LOGI(TAG, "total ram consume: %d KB", (total_ram_size_before - total_ram_size_after)/1024);
ESP_LOGI(TAG, "internal ram consume: %d KB", (internal_ram_size_before - internal_ram_size_after)/1024);
ESP_LOGI(TAG, "psram consume: %d KB\n\n", (psram_size_before - psram_size_after)/1024);
}
vTaskDelete(NULL);
}
// Measure AFE SR (wakenet) CPU load and memory consumption: create an AFE
// instance from the default config, run the feed/detect/cpuloading tasks
// for 10 s, then tear everything down.
TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe_sr]")
{
    srmodel_list_t *models = esp_srmodel_init("model");
    if (models != NULL) {
        for (int i = 0; i < models->num; i++) {
            printf("Load: %s\n", models->model_name[i]);
        }
    }
    char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
    // Guard against a NULL filter result: passing NULL to printf "%s" is
    // undefined behavior (matches the nsnet_name handling in the VC tests).
    printf("wn_name: %s\n", wn_name ? wn_name : "");
    // Snapshot free heap so test_print_cpuloading() can report deltas.
    total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
    psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
    esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE;
    afe_config_t afe_config = AFE_CONFIG_DEFAULT();
    afe_config.wakenet_model_name = wn_name;
    afe_data = afe_handle->create_from_config(&afe_config);
    if (!afe_data) {
        printf("afe_data is null!\n");
        // Release the model list on early return; the original leaked it here
        // while every other exit path calls esp_srmodel_deinit().
        esp_srmodel_deinit(models);
        return;
    }
    s_cpu_test_task_flag = 1;
    // feed on core 0; detect and the stats printer on core 1
    xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0);
    xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1);
    xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1);
    vTaskDelay(10000 / portTICK_PERIOD_MS);  // let the pipeline run for 10 s
    s_cpu_test_task_flag = 0;
    vTaskDelay(2000 / portTICK_PERIOD_MS);   // give the tasks time to exit
    ESP_LOGI(TAG, "destroy\n");
    afe_handle->destroy(afe_data);
    afe_data = NULL;
    esp_srmodel_deinit(models);
    ESP_LOGI(TAG, "successful\n");
}
/******************************************** Divide VC Test ********************************************/
// NOTE(review): the next two TEST_CASE headers appear back-to-back sharing a
// single body — this looks like a merge/diff-rendering artifact. As written,
// the first TEST_CASE has no body of its own, and the braces below do not
// balance: the aec/se/vad loops opened at the top are never closed before the
// next TEST_CASE begins. This section needs to be split back into two separate
// test cases — confirm against the pre-merge sources.
TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe_vc]")
TEST_CASE("afe performance test (2ch)", "[afe_perf]")
{
// Heap snapshots taken around each create/feed/destroy cycle.
int start_total_mem_size = 0;
int start_internal_mem_size = 0;
int start_spiram_mem_size = 0;
int end_total_mem_size = 0;
int end_internal_mem_size = 0;
int end_spiram_mem_size = 0;
// 2-mic + reference input layout; used by the performance loop at the bottom.
const char *input_format = "MMR";
afe_type_t afe_type = AFE_TYPE_VC;
afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST};
int audio_chunksize = 0;
int16_t *feed_buff = NULL;
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE;
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
afe_config.wakenet_init = false;
afe_config.voice_communication_init = true;
// Sweep every on/off combination of AEC, SE, VAD and VC-AGC.
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int voice_communication_agc_init = 0; voice_communication_agc_init < 2; voice_communication_agc_init++) {
// Neural NS (NS_MODE_NET) is only available on ESP32-S3/P4; other targets
// fix the mode to the signal-processing variant.
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
for (int afe_ns_mode = 0; afe_ns_mode < 2; afe_ns_mode++) {
#else
int afe_ns_mode = NS_MODE_SSP;
#endif
printf("aec_init: %d, se_init: %d, vad_init: %d, voice_communication_agc_init: %d, afe_ns_mode: %d\n", aec_init, se_init, vad_init, voice_communication_agc_init, afe_ns_mode);
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.voice_communication_agc_init = voice_communication_agc_init;
afe_config.afe_ns_mode = (afe_ns_mode_t)afe_ns_mode;
//start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
//start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
//start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
// Run each configuration twice; only the second iteration is required to be
// exactly leak-free (the first may warm up internal allocations).
for (int i = 0; i < 2; i++) {
printf("index: %d\n", i);
vTaskDelay(500 / portTICK_PERIOD_MS);
start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
// NOTE(review): this shadows the outer `models` declared above — the outer
// list is never deinitialized by this loop; verify intended.
srmodel_list_t *models = esp_srmodel_init("model");
char *nsnet_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");
afe_config.afe_ns_model_name = nsnet_name;
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
printf("afe_data is null\n");
continue;
}
// One feed with a zeroed buffer, then immediate destroy: exercises the
// create/feed/destroy path for leak checking.
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
}
esp_srmodel_deinit(models);
vTaskDelay(1000 / portTICK_PERIOD_MS);
end_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
end_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
end_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
printf("memory leak: %d\n", start_total_mem_size - end_total_mem_size);
if (i > 0) { // skip index = 0
TEST_ASSERT_EQUAL(start_internal_mem_size, end_internal_mem_size);
TEST_ASSERT_EQUAL(start_spiram_mem_size, end_spiram_mem_size);
TEST_ASSERT_EQUAL(start_total_mem_size, end_total_mem_size);
} else {
TEST_ASSERT_EQUAL(true, (start_total_mem_size - end_total_mem_size) < 1000);
}
}
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
}
#endif
}
// NOTE(review): this loop appears to belong to the "afe performance test
// (2ch)" body — it runs feed/fetch tasks per mode rather than leak checks.
for (int mode_id=0; mode_id<2; mode_id++) {
afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]);
if (afe_config->wakenet_init && afe_config->wakenet_model_name) {
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
afe_task_into_t task_info;
task_info.afe_data = afe_data;
task_info.afe_handle = afe_handle;
task_info.feed_task = NULL;
task_info.fetch_task = NULL;
fetch_task_flag = 1;
xTaskCreatePinnedToCore(&test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0);
xTaskCreatePinnedToCore(&test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0);
// Wait until the fetch task signals completion.
while (fetch_task_flag) {
vTaskDelay(32 / portTICK_PERIOD_MS);
}
}
afe_config_free(afe_config);
}
}
// Measure AFE VC (voice communication) CPU load and memory consumption:
// build a VC pipeline (AGC on, neural NS where the target supports it),
// run the feed/detect/cpuloading tasks for 20 s, then tear down.
TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe_vc]")
{
    // Snapshot free heap so test_print_cpuloading() can report deltas.
    total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
    psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);

    srmodel_list_t *models = esp_srmodel_init("model");
    char *nsnet_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
    nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
    printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");

    esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE;
    afe_config_t cfg = AFE_CONFIG_DEFAULT();
    cfg.wakenet_init = false;
    cfg.voice_communication_init = true;
    cfg.voice_communication_agc_init = true;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
    cfg.afe_ns_mode = NS_MODE_NET;  // neural NS model available on S3/P4
#else
    cfg.afe_ns_mode = NS_MODE_SSP;  // fall back to signal-processing NS
#endif
    cfg.afe_ns_model_name = nsnet_name;

    afe_data = afe_handle->create_from_config(&cfg);
    if (!afe_data) {
        printf("afe_data is null!\n");
        return;
    }

    s_cpu_test_task_flag = 1;
    // feed on core 0; detect and the stats printer on core 1
    xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0);
    xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1);
    xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1);
    vTaskDelay(20000 / portTICK_PERIOD_MS);  // run the pipeline for 20 s
    s_cpu_test_task_flag = 0;
    vTaskDelay(2000 / portTICK_PERIOD_MS);   // give the tasks time to exit

    ESP_LOGI(TAG, "destroy\n");
    afe_handle->destroy(afe_data);
    esp_srmodel_deinit(models);
    afe_data = NULL;
    ESP_LOGI(TAG, "successful\n");
}
}

View File

@ -35,7 +35,6 @@ def test_multinet_p4(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'mn5q8_en',
'wn9_hilexin',
],
)
@ -47,8 +46,7 @@ def test_wakenet(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'p4_mn7_en',
'p4_nsnet2',
'p4_wn9_hilexin',
],
)
def test_wakenet_p4(dut: Dut)-> None:
@ -59,44 +57,21 @@ def test_wakenet_p4(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'afe',
'wn9_hilexin',
'vadnet',
],
)
def test_sr_afe(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_sr", timeout=100000)
dut.run_all_single_board_cases(group="afe", timeout=3600)
@pytest.mark.target('esp32p4')
@pytest.mark.env('esp32p4')
@pytest.mark.parametrize(
'config',
[
'p4_mn7_cn',
'p4_afe',
'p4_wn9_hilexin',
],
)
def test_sr_afe_p4(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_sr", timeout=100000)
@pytest.mark.target('esp32s3')
@pytest.mark.env('esp32s3')
@pytest.mark.parametrize(
'config',
[
'nsnet2',
],
)
def test_vc_afe(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_vc", timeout=100000)
@pytest.mark.target('esp32p4')
@pytest.mark.env('esp32p4')
@pytest.mark.parametrize(
'config',
[
'p4_nsnet2',
],
)
def test_vc_afe_p4(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_vc", timeout=100000)
dut.run_all_single_board_cases(group="afe", timeout=3600)

View File

@ -2,20 +2,22 @@
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y
CONFIG_ESP32S3_DATA_CACHE_64KB=y
CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y

View File

@ -1,5 +1,5 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
@ -9,6 +9,10 @@ CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_esp32.csv"
CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_INT_WDT_TIMEOUT_MS=1000
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744

View File

@ -1,23 +0,0 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y
CONFIG_ESP32S3_DATA_CACHE_64KB=y
CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024

View File

@ -0,0 +1,23 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_ESP32P4_REV_MIN_0=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_MBEDTLS_CMAC_C=y
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

View File

@ -1,10 +1,12 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_MN_CN_MULTINET7_QUANT=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y
@ -14,7 +16,6 @@ CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8000
CONFIG_ESP_INT_WDT=n
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_FREERTOS_HZ=1000

View File

@ -1,10 +1,12 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HIESP=y
CONFIG_SR_MN_EN_MULTINET7_QUANT=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y

View File

@ -1,22 +1,20 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_WN_WN9_HIESP=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_ESP32P4_REV_MIN_0=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10000
CONFIG_ESP_INT_WDT=n
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_FREERTOS_HZ=1000
CONFIG_MBEDTLS_CMAC_C=y
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

View File

@ -2,13 +2,13 @@
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_ESP_PHY_REDUCE_TX_POWER=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
@ -21,4 +21,4 @@ CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024

View File

@ -0,0 +1,6 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y