Merge branch 'refactor/afe' into 'master'

refactor(esp32s3): update AFE interface

See merge request speech-recognition-framework/esp-sr!131
This commit is contained in:
Sun Xiang Yu 2025-02-05 16:52:46 +08:00
commit 3b549e2d91
76 changed files with 2232 additions and 1513 deletions

View File

@ -315,4 +315,4 @@ push_to_github:
- echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
- git remote remove github &>/dev/null || true
- git remote add github git@github.com:espressif/esp-sr.git
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"

View File

@ -1,13 +1,14 @@
if(IDF_TARGET STREQUAL "esp32")
if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32"))
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32
"esp-tts/esp_tts_chinese/include"
"include/${IDF_TARGET}"
"src/include"
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
"src/model_path.c"
"src/esp_mn_speech_commands.c"
"src/esp_process_sdkconfig.c"
)
set(requires
@ -20,244 +21,320 @@ if(IDF_TARGET STREQUAL "esp32")
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32")
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
multinet
dl_lib
c_speech_features
wakeword_model
multinet2_ch
esp_audio_processor
esp_audio_front_end
esp_tts_chinese
voice_set_xiaole
wakenet
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32s3")
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32s3
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
)
set(requires
json
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}")
add_prebuilt_library(dl_lib "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libdl_lib.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(fst "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libfst.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
hufzip
set(sr_libs
dl_lib
fst
c_speech_features
$<TARGET_FILE:${esp_dsp_lib}>
c_speech_features
esp_audio_front_end
esp_audio_processor
multinet
flite_g2p
esp_tts_chinese
voice_set_xiaole
fst
flite_g2p
hufzip
multinet
nsnet
vadnet
wakenet
"-Wl,--end-group")
wakenet)
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
if(${IDF_TARGET} STREQUAL "esp32")
add_prebuilt_library(multinet2_ch "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet2_ch.a" PRIV_REQUIRES ${COMPONENT_NAME})
list(APPEND sr_libs multinet2_ch)
endif()
elseif(${IDF_TARGET} STREQUAL "esp32p4")
set(include_dirs
src/include
esp-tts/esp_tts_chinese/include
include/esp32p4
)
set(srcs
src/model_path.c
src/esp_mn_speech_commands.c
src/esp_process_sdkconfig.c
)
set(requires
json
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS ${include_dirs}
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
hufzip
dl_lib
fst
c_speech_features
$<TARGET_FILE:${esp_dsp_lib}>
esp_audio_front_end
esp_audio_processor
multinet
flite_g2p
esp_tts_chinese
voice_set_xiaole
wakenet
vadnet
nsnet
${sr_libs}
"-Wl,--end-group")
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
if(CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32P4)
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
idf_build_get_property(build_dir BUILD_DIR)
set(image_file ${build_dir}/srmodels/srmodels.bin)
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
add_custom_command(
OUTPUT ${image_file}
COMMENT "Move and Pack models..."
COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
DEPENDS ${SDKCONFIG}
VERBATIM)
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
add_dependencies(flash srmodels_bin)
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
partition_table_get_partition_info(size "--partition-name model" "size")
partition_table_get_partition_info(offset "--partition-name model" "offset")
if("${size}" AND "${offset}")
esptool_py_flash_to_partition(flash "model" "${image_file}")
else()
set(message "Failed to find model in partition table file"
"Please add a line(Name=model, Size>recommended size in log) to the partition file.")
endif()
endif()
elseif(${IDF_TARGET} STREQUAL "esp32s2")
set(requires
elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
set(requires
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32c3")
set(requires
spiffs
)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}")
add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
elseif(${IDF_TARGET} STREQUAL "esp32c6")
set(requires
spiffs
)
IF (IDF_VERSION_MAJOR GREATER 4)
list(APPEND requires esp_partition)
ENDIF (IDF_VERSION_MAJOR GREATER 4)
idf_component_register(SRCS .
INCLUDE_DIRS esp-tts/esp_tts_chinese/include
REQUIRES ${requires}
PRIV_REQUIRES spi_flash)
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6")
target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
esp_tts_chinese
voice_set_xiaole
"-Wl,--end-group")
endif()
# elseif(${IDF_TARGET} STREQUAL "esp32s3")
# set(include_dirs
# src/include
# esp-tts/esp_tts_chinese/include
# include/esp32s3
# )
# set(srcs
# src/model_path.c
# src/esp_mn_speech_commands.c
# src/esp_process_sdkconfig.c
# )
# set(requires
# json
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS ${srcs}
# INCLUDE_DIRS ${include_dirs}
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3")
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3")
# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
# hufzip
# dl_lib
# fst
# c_speech_features
# $<TARGET_FILE:${esp_dsp_lib}>
# esp_audio_front_end
# esp_audio_processor
# multinet
# flite_g2p
# esp_tts_chinese
# voice_set_xiaole
# nsnet
# vadnet
# wakenet
# "-Wl,--end-group")
# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
# idf_build_get_property(build_dir BUILD_DIR)
# set(image_file ${build_dir}/srmodels/srmodels.bin)
# add_custom_command(
# OUTPUT ${image_file}
# COMMENT "Move and Pack models..."
# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
# DEPENDS ${SDKCONFIG}
# VERBATIM)
# add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
# add_dependencies(flash srmodels_bin)
# partition_table_get_partition_info(size "--partition-name model" "size")
# partition_table_get_partition_info(offset "--partition-name model" "offset")
# if("${size}" AND "${offset}")
# esptool_py_flash_to_partition(flash "model" "${image_file}")
# else()
# set(message "Failed to find model in partition table file"
# "Please add a line(Name=model, Size>recommended size in log) to the partition file.")
# endif()
# elseif(${IDF_TARGET} STREQUAL "esp32p4")
# set(include_dirs
# src/include
# esp-tts/esp_tts_chinese/include
# include/esp32p4
# )
# set(srcs
# src/model_path.c
# src/esp_mn_speech_commands.c
# src/esp_process_sdkconfig.c
# )
# set(requires
# json
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS ${srcs}
# INCLUDE_DIRS ${include_dirs}
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4")
# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4")
# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
# add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB)
# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
# hufzip
# dl_lib
# fst
# c_speech_features
# $<TARGET_FILE:${esp_dsp_lib}>
# esp_audio_front_end
# esp_audio_processor
# multinet
# flite_g2p
# esp_tts_chinese
# voice_set_xiaole
# wakenet
# vadnet
# nsnet
# "-Wl,--end-group")
# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)
# idf_build_get_property(build_dir BUILD_DIR)
# set(image_file ${build_dir}/srmodels/srmodels.bin)
# add_custom_command(
# OUTPUT ${image_file}
# COMMENT "Move and Pack models..."
# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir}
# DEPENDS ${SDKCONFIG}
# VERBATIM)
# add_custom_target(srmodels_bin ALL DEPENDS ${image_file})
# add_dependencies(flash srmodels_bin)
# partition_table_get_partition_info(size "--partition-name model" "size")
# partition_table_get_partition_info(offset "--partition-name model" "offset")
# if("${size}" AND "${offset}")
# esptool_py_flash_to_partition(flash "model" "${image_file}")
# else()
# set(message "Failed to find model in partition table file"
# "Please add a line(Name=model, Size>recommended size in log) to the partition file.")
# endif()
# elseif(${IDF_TARGET} STREQUAL "esp32s2")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# elseif(${IDF_TARGET} STREQUAL "esp32c3")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# elseif(${IDF_TARGET} STREQUAL "esp32c6")
# set(requires
# spiffs
# )
# IF (IDF_VERSION_MAJOR GREATER 4)
# list(APPEND requires esp_partition)
# ENDIF (IDF_VERSION_MAJOR GREATER 4)
# idf_component_register(SRCS .
# INCLUDE_DIRS esp-tts/esp_tts_chinese/include
# REQUIRES ${requires}
# PRIV_REQUIRES spi_flash)
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6")
# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group"
# esp_tts_chinese
# voice_set_xiaole
# "-Wl,--end-group")
# endif()

View File

@ -45,6 +45,23 @@ Resource Consumption
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+================+=================+
| MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
.. only:: esp32p4
+-----------------+-----------------+-----------------+-----------------+
@ -52,21 +69,33 @@ Resource Consumption
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(LOW_COST) | 152.3 KB | 8% | 32 ms |
| AEC(LOW_COST) | 152.3 KB | 6% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| AEC(HIGH_PERF) | 166 KB | 11% | 32 ms |
| BSS(LOW_COST) | 198.7 KB | 3% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(LOW_COST) | 198.7 KB | 6% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(HIGH_PERF) | 215.5 KB | 7% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
| NS | 27 KB | 3% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| MISO | 56 KB | 8% | 16 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+=================+=================+
| MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
WakeNet
-------

View File

@ -49,6 +49,22 @@ AFE
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+================+=================+
| MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
| MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% |
+--------------+------+-----------+---------------+------------+----------------+-----------------+
.. only:: esp32p4
+-----------------+-----------------+-----------------+-----------------+
@ -67,6 +83,22 @@ AFE
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU |
+==============+======+===========+===============+============+=================+=================+
| MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
| MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% |
+--------------+------+-----------+---------------+------------+-----------------+-----------------+
WakeNet
-------

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
typedef enum {
AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC fro speech recognition
AEC_MODE_SR_HIGH_PERF = 1, // High Perforamce AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4, // High Perforamce AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC swich for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Concatenate input audio frames into output frames of a different frame size
*
* @warning the input data will be modified inplace.
*
 * @param in_data The input audio data
 * @param in_frame_size The frame size of the input data
 * @param channel_num The channel number of input data, which is same as output data
 * @param out_data The output audio data
 * @param out_frame_size The frame size of the output data
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, // the input frame must be 10 ms; combine this with the sample rate to derive the frame size
} ESP_AGE_ERR;
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -0,0 +1,164 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
#endif
// Opaque model data container
typedef struct model_iface_data_t model_iface_data_t;
// /**
// * @brief The state of vad
// */
// typedef enum {
// VAD_NOISE = -1, // Noise
// VADNET_STATE_SILENCE = 0, // Silence
// VAD_SPEECH = 1 // Speech
// } vad_state_t;
/**
 * @brief Easy function type to initialize a model instance with a detection mode
 * and specified model name
 *
 * @param model_name The specified model name
 * @param mode The voice activity detection mode
 * @param channel_num The number of input audio channels
 * @param min_speech_ms The minimum duration of speech in ms required to trigger the vad
 * speech state
 * @param min_noise_ms The minimum duration of noise in ms required to trigger the vad
 * noise state
 * @returns Handle to the model data
 */
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
    const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
/**
 * @brief Get the amount of samples that need to be passed to the detect
 * function
 *
 * Every voice activity detection model processes a certain number of samples at the
 * same time. This function can be used to query that amount. Note that the
 * returned amount is in 16-bit samples, not in bytes.
 *
 * @param model The model object to query
 * @return The amount of samples to feed the detect function
 */
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
/**
 * @brief Get the channel number of the samples that need to be passed to the detect
 * function
 *
 * @param model The model object to query
 * @return The number of input audio channels the model expects
 */
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
/**
 * @brief Get the sample rate of the samples to feed to the detect function
 *
 * @param model The model object to query
 * @return The sample rate, in hz
 */
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
/**
 * @brief Set the detection threshold to manually adjust the detection probability
 *
 * @param model The model object to query
 * @param det_threshold The threshold that triggers the speech state; the range of
 * det_threshold is 0.5~0.9999
 * @return 0: setting failed, 1: setting success
 */
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
/**
 * @brief Get the voice activity detection threshold
 *
 * @param model The model object to query
 * @returns the detection threshold
 */
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
/**
 * @brief Feed samples of an audio stream to the vad model and detect whether it is
 * voice.
 *
 * @param model The model object to query
 * @param samples An array of 16-bit signed audio samples. The array size used
 * can be queried by the get_samp_chunksize function.
 * @return The vad state: VAD_SILENCE if no voice, VAD_SPEECH if voice is detected
 */
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
 * @brief Feed MFCC features of an audio stream to the vad model and detect whether it is
 * voice.
 *
 * @param model The model object to query
 * @param cq A queue of 16-bit MFCC features.
 * @return The vad state: VAD_SILENCE if no voice, VAD_SPEECH if voice is detected
 */
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
 * @brief Get MFCC features of an audio stream
 *
 * @param model The model object to query
 * @return MFCC data
 */
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
 * @brief Get the triggered channel index. Channel index starts from zero
 *
 * @param model The model object to query
 * @return The index of the channel that triggered detection
 */
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
/**
 * @brief Clean all internal states of the model
 *
 * @param model The model object to query
 */
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
/**
 * @brief Destroy a model object and free its resources
 *
 * @param model Model object to destroy
 */
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
/**
 * This structure contains the functions used to do operations on a voice
 * activity detection model.
 */
typedef struct {
    esp_vadn_iface_op_create_t create;                             // create a model instance
    esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;     // samples per detect() call
    esp_vadn_iface_op_get_channel_num_t get_channel_num;           // expected input channel count
    esp_vadn_iface_op_get_samp_rate_t get_samp_rate;               // expected sample rate in Hz
    esp_vadn_iface_op_set_det_threshold_t set_det_threshold;       // set detection threshold
    esp_vadn_iface_op_get_det_threshold_t get_det_threshold;       // get detection threshold
    esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; // triggered channel index
    esp_vadn_iface_op_detect_t detect;                             // run detection on PCM samples
    esp_vadn_iface_op_detect_mfcc_t detect_mfcc;                   // run detection on MFCC features
    esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;               // fetch MFCC features
    esp_vadn_iface_op_clean_t clean;                               // reset internal states
    esp_vadn_iface_op_destroy_t destroy;                           // free the model instance
} esp_vadn_iface_t;
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,22 @@
#pragma once
#include "esp_vadn_iface.h"
#ifdef __cplusplus
extern "C" {
#endif
// The prefix of vadnet model names; used to filter all vadnet models from the available models.
#define ESP_VADN_PREFIX "vadnet"
/**
 * @brief Get the vadnet interface handle from a model name
 *
 * @param model_name The name of the vadnet model
 * @returns The interface handle of the vadnet
 */
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
// State container for the webrtc processing chain (NS + AGC).
typedef struct {
void* ns_handle;        // handle of the noise-suppression instance
void* agc_handle;       // handle of the AGC instance
int frame_size;         // samples per processing frame
int sample_rate;        // sample rate in Hz
int16_t *buff;          // internal working buffer
int16_t *out_data;      // output buffer returned by webrtc_process
sr_ringbuf_handle_t rb; // ring buffer used to buffer input samples
}webrtc_handle_t;
/**
 * @brief Creates an instance of webrtc.
 *
 * @warning frame_length_ms supports 10 ms, 20 ms, 30 ms and 32 ms only.
 *
 * @param frame_length_ms The frame length (in ms) of the audio processing
 * @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
 * @param agc_mode The mode of AGC
 * @param agc_gain The gain of AGC. default is 9
 * @param agc_target_level The target level of AGC. default is -3 dbfs
 * @param sample_rate The sample rate of the audio.
 *
 * @return
 *     - NULL: Create failed
 *     - Others: The instance of webrtc
 */
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
 * @brief Feed samples of an audio stream to webrtc and get the audio stream
 *        back after noise suppression / gain control.
 *
 * NOTE(review): `bool` is used below but <stdbool.h> is not included in this
 * header; presumably it is pulled in via one of the included headers — verify.
 *
 * @param handle The instance of webrtc.
 * @param indata An array of 16-bit signed audio samples.
 * @param size The sample size of the output data
 * @param enable_ns Enable noise suppression
 * @param enable_agc Enable automatic gain control
 *
 * @return Data after noise suppression / gain control
 */
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
 * @brief Free the webrtc instance
 *
 * @param handle The instance of webrtc.
 *
 * @return None
 *
 */
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_WEBRTC_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
// Operating modes of AEC: SR modes feed a speech recognizer, VOIP modes target voice communication.
typedef enum {
AEC_MODE_SR_LOW_COST = 0,      // Low cost AEC for speech recognition
AEC_MODE_SR_HIGH_PERF = 1,     // High performance AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3,    // Low cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4,   // High performance AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: aggressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
// Set AFE type: selects the processing pipeline for the target scenario
typedef enum {
AFE_TYPE_SR = 0,    // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1,    // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
// AGC implementation selector for the AFE pipeline
typedef enum {
AFE_AGC_MODE_WEBRTC = 0,     // WebRTC AGC
AFE_AGC_MODE_WAKENET = 1,    // AGC gain is calculated by the wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC switch for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output (note: invalid for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
 * @brief Adjust the gain of input data
 *
 * @warning the input data will be modified in place.
 *
 * @param data The input audio data
 * @param frame_size The frame size of input, it is also the size of single channel data
 * @param factor The gain factor
 *
 * @return int16_t* The output audio data
 */
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
 * @brief Concatenate input frames into larger output frames, per channel
 *
 * NOTE(review): the original @brief said "Adjust the gain of input data",
 * which appears to be a copy-paste from afe_adjust_gain above; the name and
 * parameters indicate frame concatenation — verify against the implementation.
 *
 * @param in_data The input audio data
 * @param in_frame_size Input frame size (per channel)
 * @param channel_num The channel number of input data, which is same as output data
 * @param out_data The output audio data
 * @param out_frame_size Output frame size (per channel)
 *
 */
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, // the input frame size must be 10 ms; combine it with the sample rate to compute the frame size
} ESP_AGE_ERR;
// Operating modes for the WebRTC-based AGC (Automatic Gain Control).
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
// VAD engine instance paired with a trigger used to smooth raw VAD decisions.
typedef struct {
vad_trigger_t *trigger;   // Trigger applied on top of raw VAD states (see vad_trigger_detect)
void *vad_inst;           // Underlying VAD engine instance (opaque)
int sample_rate;          // Sample rate in Hz of the audio fed to this instance
int frame_size;           // Per-frame size — assumed to be in samples; TODO confirm (samples vs bytes)
}vad_handle_with_trigger_t;
// Public VAD handle type: pointer to the struct above.
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -1,6 +1,7 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Feed MFCC features of an audio stream to the VAD model and detect
* whether it contains voice.
*
* @param model The model object to query
* @param cq Queue of 16-bit MFCC features.
* @return VAD_SILENCE if no voice is present, VAD_SPEECH if voice is detected.
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* @brief Get the triggered channel index. Channel index starts from zero
*
@ -133,6 +153,8 @@ typedef struct {
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_vadn_iface_op_detect_t detect;
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_vadn_iface_op_clean_t clean;
esp_vadn_iface_op_destroy_t destroy;
} esp_vadn_iface_t;

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
// Aggregates the NS and AGC state used by the webrtc_* wrapper API below.
typedef struct {
void* ns_handle;            // Noise-suppression instance (opaque; see esp_ns.h)
void* agc_handle;           // AGC instance (opaque; see esp_agc.h)
int frame_size;             // Samples per processing frame — presumably derived from frame_length_ms; verify
int sample_rate;            // Sample rate of the audio in Hz
int16_t *buff;              // Internal scratch buffer — presumably staging for ring-buffer data; verify
int16_t *out_data;          // Output buffer — presumably what webrtc_process() returns; verify
sr_ringbuf_handle_t rb;     // Ring buffer used to buffer input samples
}webrtc_handle_t;
/**
* @brief Creates an instance of webrtc.
*
* @warning frame_length_ms can be 10 ms, 20 ms, 30 ms, or 32 ms.
*
* @param frame_length_ms The length of the audio processing
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
* @param agc_mode The model of AGC
* @param agc_gain The gain of AGC. default is 9
* @param agc_target_level The target level of AGC. default is -3 dbfs
* @param sample_rate The sample rate of the audio.
*
* @return
* - NULL: Create failed
* - Others: The instance of webrtc
*/
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
*
* @param handle The instance of NS.
* @param in_data An array of 16-bit signed audio samples.
* @param out_size The sample size of output data
* @param enable_ns Enable noise suppression
* @param enable_agc Enable automatic gain control
*
* @return data after noise suppression
*/
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
* @brief Free the webrtc instance
*
* @param handle The instance of webrtc.
*
* @return None
*
*/
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_WEBRTC_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed audio samples and their MFCC features to the wakenet model and
* detect whether a wake word is present.
*
* @param model The model object to query
* @param samples An array of 16-bit audio samples.
* @param cq Queue of 16-bit MFCC features.
* @return The index of the detected wake word, or 0 if no wake word is detected.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

View File

@ -21,80 +21,72 @@ extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_USE_SPIRAM 0
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
//#define AEC_FRAME_LENGTH_MS 16
#define AEC_FRAME_LENGTH_MS 32
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
typedef void* aec_handle_t;
typedef struct aec_handle_t aec_handle_t;
// AEC operating modes: low-cost vs. high-performance variants for the
// speech-recognition (SR) and voice-communication (VoIP) scenarios.
// Note: value 2 is skipped in this enum — do not renumber.
typedef enum {
AEC_MODE_SR_LOW_COST = 0, // Low cost AEC for speech recognition
AEC_MODE_SR_HIGH_PERF = 1, // High performance AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3, // Low cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4, // High performance AEC for voice communication
} aec_mode_t;
/**
* @brief Creates an instance to the AEC structure.
* Please get frame size by aec_get_chunksize() function
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Creates an instance to the AEC structure.
* @brief Creates an instance to the AEC structure, same with aec_create().
*
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
*
* @param sample_rate The Sampling frequency (Hz) must be 16000.
*
* @param frame_length The length of the audio processing must be 16ms.
*
* @param filter_length Number of samples of echo to cancel.
*
* @param nch Number of input signal channel.
*
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
* @param channel_num The input microphone channel number
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
* @return
* - NULL: Create failed
* - Others: The instance of AEC
*/
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
/**
* @brief Creates an instance of more powerful AEC.
*
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
*
* @param nch Number of microphones.
*
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3.
*
* @return
* - NULL: Create failed
* - Others: An Instance of AEC
*/
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
*
* @param inst The instance of AEC.
*
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
*
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
* @param indata An array of 16-bit signed audio samples from mic.
*
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
*
* @param outdata Returns near-end signal with echo removed.
*
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
* @return None
*
*/
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
* @brief Get frame size of AEC (the samples of one frame)
* @param handle The instance of AEC.
* @return Frame size
*/
int aec_get_chunksize(const aec_handle_t *handle);
/**
* @brief Get AEC mode string
*
* @param aec_mode The mode of AEC.
*
* @return AEC mode string
*/
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
* @brief Free the AEC instance
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
* @return None
*
*/
void aec_destroy(aec_handle_t inst);
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}

View File

@ -1,24 +1,41 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//VC: Voice Communication
//Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0,
SR_MODE_HIGH_PERF = 1
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
// Set AFE mode (replacement for the deprecated afe_sr_mode_t values).
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
// Set AFE type: selects the target processing scenario.
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
@ -26,24 +43,30 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
int mic_num; // mic channel num
int ref_num; // reference channel num
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
NS_MODE_SSP = 0, // speech signal process method
NS_MODE_NET = 1, // deep noise suppression net method
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
// Selects which component computes the AGC gain inside the AFE.
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
@ -66,148 +89,192 @@ typedef struct {
} afe_debug_hook_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
bool voice_communication_init;
bool voice_communication_agc_init; // AGC swich for voice communication
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
#if CONFIG_IDF_TARGET_ESP32
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_HIGH_PERF, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 2, \
.mic_num = 1, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_0, \
.wakenet_model_name = NULL, \
.wakenet_model_name_2 = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config = { \
.total_ch_num = 3, \
.mic_num = 2, \
.ref_num = 1, \
.sample_rate = 16000, \
}, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
* @param afe_config Input AFE config
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
/**
* @brief Parse I2S input data
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
* @warning the input data will be modified inplace.
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
* @param out_data The output audio data
* @param out_frame_size Onput data frame size of input
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}

View File

@ -1,7 +1,10 @@
#pragma once
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
@ -13,13 +16,15 @@ extern "C" {
//Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // noise or silence
AFE_VAD_SPEECH // speech
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
@ -27,7 +32,7 @@ typedef enum
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
} afe_fetch_result_t;
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the total channel number which be config
* @brief Get the channel number
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the mic channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
*
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
/**
* @brief Disable wakenet model.
* @brief Enable VAD algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable wakenet model.
* @brief Disable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable AEC algorithm.
* @brief Enable one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable AEC algorithm.
* @brief Print all functions/modules/algorithms pipeline.
* The pipeline is the order of the functions/modules/algorithms.
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable SE algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
*/
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy a AFE_SR instance
@ -191,22 +187,41 @@ typedef struct {
esp_afe_sr_iface_op_create_from_config_t create_from_config;
esp_afe_sr_iface_op_feed_t feed;
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
esp_afe_sr_iface_op_disable_aec_t disable_aec;
esp_afe_sr_iface_op_enable_aec_t enable_aec;
esp_afe_sr_iface_op_disable_se_t disable_se;
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
esp_afe_sr_iface_op_enable_func_t enable_aec;
esp_afe_sr_iface_op_disable_func_t disable_se;
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;
esp_afe_sr_iface_op_enable_func_t enable_agc;
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
#ifdef __cplusplus
}
#endif

View File

@ -6,17 +6,7 @@ extern "C" {
#include "esp_afe_sr_iface.h"
#if CONFIG_AFE_INTERFACE_V1
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#else
#error No valid afe selected.
#endif
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}

View File

@ -26,8 +26,15 @@ typedef enum {
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
} ESP_AGE_ERR;
typedef enum {
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
AGC_MODE_0 = 0, // Only saturation protection
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
} agc_mode_t;
void *esp_agc_open(int agc_mode, int sample_rate);
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
void esp_agc_close(void *agc_handle);

View File

@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* @brief Creates an instance to the VAD structure.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @param sample_rate Sample rate in Hz
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
* @param min_speech_ms Minimum speech duration, unit is ms
* @param min_noise_ms Minimum noise duration, unit is ms
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param inst The instance of VAD.
*
* @param data An array of 16-bit signed audio samples.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
*
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
*
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*
* @param handle The instance of VAD.
* @param data An array of 16-bit signed audio samples.
* @return
* - VAD_SILENCE if no voice
* - VAD_SPEECH if voice is detected
*
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Free the VAD instance

View File

@ -1,6 +1,7 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* @brief Get the triggered channel index. Channel index starts from zero
*
@ -133,6 +153,8 @@ typedef struct {
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_vadn_iface_op_detect_t detect;
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_vadn_iface_op_clean_t clean;
esp_vadn_iface_op_destroy_t destroy;
} esp_vadn_iface_t;

View File

@ -0,0 +1,90 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_WEBRTC_H_
#define _ESP_WEBRTC_H_
#ifdef __cplusplus
extern "C" {
#endif
#include <stdint.h>
#include "sr_ringbuf.h"
#include "esp_log.h"
#include "esp_agc.h"
#include "esp_ns.h"
#include "esp_heap_caps.h"
typedef struct {
void* ns_handle;
void* agc_handle;
int frame_size;
int sample_rate;
int16_t *buff;
int16_t *out_data;
sr_ringbuf_handle_t rb;
}webrtc_handle_t;
/**
* @brief Creates an instance of webrtc.
*
* @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms.
*
* @param frame_length_ms The length of the audio processing
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
* @param agc_mode The model of AGC
* @param agc_gain The gain of AGC. default is 9
* @param agc_target_level The target level of AGC. default is -3 dbfs
* @param sample_rate The sample rate of the audio.
*
* @return
* - NULL: Create failed
* - Others: The instance of webrtc
*/
webrtc_handle_t* webrtc_create(
int frame_length_ms,
int ns_mode,
agc_mode_t agc_mode,
int agc_gain,
int agc_target_level,
int sample_rate);
/**
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
*
* @param handle The instance of NS.
* @param in_data An array of 16-bit signed audio samples.
* @param out_size The sample size of output data
* @param enable_ns Enable noise suppression
* @param enable_agc Enable automatic gain control
*
* @return data after noise suppression
*/
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
/**
* @brief Free the webrtc instance
*
* @param handle The instance of webrtc.
*
* @return None
*
*/
void webrtc_destroy(webrtc_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_NS_H_

View File

@ -1,5 +1,6 @@
#pragma once
#include "stdint.h"
#include "dl_lib_convq_queue.h"
#ifdef __cplusplus
extern "C" {
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
*/
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
* voice.
*
* @param model The model object to query
* @param cq An array of 16-bit MFCC.
* @return The index of wake words, return 0 if no wake word is detected, else
* the index of the wake words.
*/
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
/**
* @brief Get MFCC of an audio stream
*
* @param model The model object to query
* @return MFCC data
*/
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a wake word detection model.
@ -184,6 +204,8 @@ typedef struct {
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
esp_wn_iface_op_clean_t clean;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/esp32/libfst.a Normal file

Binary file not shown.

BIN
lib/esp32/libhufzip.a Normal file

Binary file not shown.

Binary file not shown.

BIN
lib/esp32/libnsnet.a Normal file

Binary file not shown.

BIN
lib/esp32/libvadnet.a Normal file

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1 +1 @@
vadnet1_mediumv1_Speech_3_0.5_0.1
vadnet1_mediumv1_Speech_1_0.5_0.1

View File

@ -958,4 +958,4 @@ end:
esp_mn_commands_print();
return esp_mn_commands_update();
}
}

View File

@ -8,7 +8,7 @@ set(srcs
idf_component_register(SRCS ${srcs}
INCLUDE_DIRS "." "samples"
REQUIRES unity esp-sr
REQUIRES unity esp-sr esp_timer
WHOLE_ARCHIVE)
target_compile_options(${COMPONENT_LIB} PRIVATE "-Wno-format")

View File

@ -12,7 +12,7 @@
#include <limits.h>
#include "unity.h"
#include "esp_log.h"
#include "esp_timer.h"
#include "model_path.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
@ -23,152 +23,187 @@
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
#include "esp_nsn_models.h"
#include "esp_nsn_iface.h"
#include "esp_vadn_models.h"
#include "esp_vadn_iface.h"
#endif
#define ARRAY_SIZE_OFFSET 8 // Increase this if audio_sys_get_real_time_stats returns ESP_ERR_INVALID_SIZE
#define AUDIO_SYS_TASKS_ELAPSED_TIME_MS 1000 // Period of stats measurement
static const char *TAG = "AFE_TEST";
static volatile int s_cpu_test_task_flag = 0;
static esp_afe_sr_data_t *afe_data = NULL;
static int total_ram_size_before = 0;
static int internal_ram_size_before = 0;
static int psram_size_before = 0;
static int detect_cnt = 0;
static int fetch_task_flag = 0;
#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS)
const static char *task_state[] = {
"Running",
"Ready",
"Blocked",
"Suspended",
"Deleted"
};
/** @brief
* "Extr": Allocated task stack from psram, "Intr": Allocated task stack from internel
*/
const static char *task_stack[] = {"Extr", "Intr"};
#endif
TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<<", "[afe_sr]")
void test_afe_by_config(afe_config_t *afe_config, int frame_num, int* memory, float* cpu, int idx)
{
int audio_chunksize = 0;
int16_t *feed_buff = NULL;
int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
int first_end_size = 0;
int end_size = 0;
int mem_leak = 0;
uint32_t feed_cpu_time = 0;
uint32_t fetch_cpu_time = 0;
uint32_t start=0, end = 0;
int loop = 3;
int feed_chunksize = 0;
int create_size = 0;
int create_internal_size = 0;
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) {
printf("aec_init: %d, se_init: %d, vad_init: %d, wakenet_init: %d\n", aec_init, se_init, vad_init, wakenet_init);
for (int i=0; i<loop; i++) {
// init config and handle
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
// afe_config_print(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
srmodel_list_t *models = esp_srmodel_init("model");
char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
char *vad_model_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
#endif
create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT);
create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE;
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.wakenet_init = wakenet_init;
afe_config.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM;
afe_config.wakenet_model_name = model_name;
afe_config.voice_communication_init = false;
afe_config.vad_model_name = vad_model_name;
if (vad_model_name) {
printf("vad_model_name:%s\n", vad_model_name);
}
// run afe feed
feed_chunksize = afe_handle->get_feed_chunksize(afe_data);
int feed_nch = afe_handle->get_feed_channel_num(afe_data);
// test model loading time
struct timeval tv_start, tv_end;
gettimeofday(&tv_start, NULL);
afe_data = afe_handle->create_from_config(&afe_config);
gettimeofday(&tv_end, NULL);
int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
printf("create latency:%d ms\n", tv_ms);
int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch);
start = esp_timer_get_time();
for (int j=0; j<frame_num; j++) {
afe_handle->feed(afe_data, feed_buff);
}
end = esp_timer_get_time();
feed_cpu_time += end - start;
// test model memory concumption
int create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT);
int create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
printf("Internal RAM: %d, PSRAM:%d\n", create_internal_size, create_size - create_internal_size);
afe_handle->destroy(afe_data);
esp_srmodel_deinit(models);
//run afe fetch
start = esp_timer_get_time();
while(1) {
afe_fetch_result_t *res = afe_handle->fetch_with_delay(afe_data, 1 / portTICK_PERIOD_MS);
if (res->ret_value != ESP_OK) {
break;
}
}
end = esp_timer_get_time();
fetch_cpu_time += end - start;
free(feed_buff);
afe_handle->destroy(afe_data);
end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
// test memory leak
int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int last_end_size = first_end_size;
int mem_leak = start_size - last_end_size;
printf("create&destroy times:%d, memory leak:%d\n", 1, mem_leak);
if (i==0) {
first_end_size = end_size;
}
mem_leak = start_size - end_size;
ESP_LOGI(TAG, "create&destroy times:%d, memory leak:%d\n", i, mem_leak);
}
uint32_t feed_data_time = loop * frame_num * feed_chunksize / 16 * 1000; // us
memory[idx*2] = create_internal_size;
memory[idx*2+1] = create_size - create_internal_size;
cpu[idx*2] = feed_cpu_time*1.0/feed_data_time;
cpu[idx*2+1] = fetch_cpu_time*1.0/feed_data_time;
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[idx*2], memory[idx*2+1], cpu[idx*2], cpu[idx*2+1]);
TEST_ASSERT_EQUAL(true, mem_leak < 1000 && end_size == first_end_size);
}
for (int i = 0; i < 6; i++) {
printf("init partition ...\n");
models = esp_srmodel_init("model");
model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
#endif
afe_config.wakenet_model_name = model_name;
afe_config.vad_model_name = vad_model_name;
TEST_CASE(">>>>>>>> AFE create/destroy API & memory leak <<<<<<<<", "[afe]")
{
const char *input_format[6] = {"MR", "MMNR"};
afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC};
afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF};
int count = 0;
int memory[512];
float cpu[512];
printf("create ...\n");
afe_data = afe_handle->create_from_config(&afe_config);
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
printf("destroy ...\n");
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int ns_init = 0; ns_init < 2; ns_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) {
printf("format: %s, type: %d, mode: %d, memory size:%d %d\n",
input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count);
afe_config_t *afe_config = afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]);
afe_config->aec_init = aec_init;
afe_config->se_init = se_init;
afe_config->ns_init = ns_init;
afe_config->vad_init = vad_init;
afe_config->wakenet_init = wakenet_init;
test_afe_by_config(afe_config, 4, memory, cpu, count);
afe_config_free(afe_config);
count++;
}
}
}
esp_srmodel_deinit(models);
vTaskDelay(100 / portTICK_PERIOD_MS);
last_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
mem_leak = start_size - last_end_size;
printf("create&destroy times:%d, memory leak:%d\n", i + 2, mem_leak);
}
TEST_ASSERT_EQUAL(true, (mem_leak) < 1000 && last_end_size == first_end_size);
}
}
}
}
for (int idx=0; idx<256; idx++) {
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[idx*2], memory[idx*2+1], cpu[idx*2], cpu[idx*2+1]);
}
printf("AFE create/destroy API & memory leak test done\n");
}
TEST_CASE(">>>>>>>> AFE default setting <<<<<<<<", "[afe_benchmark]")
{
const char *input_format[6] = {"MR", "MMNR"};
afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC};
afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF};
int count = 0;
int memory[16];
float cpu[16];
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
printf("format: %s, type: %d, mode: %d, memory size:%d %d\n",
input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count);
afe_config_t *afe_config = afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]);
test_afe_by_config(afe_config, 8, memory, cpu, count);
afe_config_free(afe_config);
count++;
}
}
}
count = 0;
for (int format_id=0; format_id<2; format_id++) {
for (int type_id=0; type_id<2; type_id++) {
for (int mode_id=0; mode_id<2; mode_id++) {
printf("--------format: %s, type: %s, mode: %s------------\n", input_format[format_id], type_id==0? "SR": "VC", mode_id==0? "LOW_COST": "HIGH_PERF");
printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n",
memory[count*2], memory[count*2+1], cpu[count*2], cpu[count*2+1]);
count++;
}
}
}
printf("test done\n");
}
void test_feed_Task(void *arg)
{
int sample_per_ms = 16;
// esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg;
afe_task_into_t *afe_task_info = (afe_task_into_t *)arg;
esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle;
esp_afe_sr_data_t *afe_data = afe_task_info->afe_data;
int feed_chunksize = afe_handle->get_feed_chunksize(afe_data);
int total_nch = afe_handle->get_total_channel_num(afe_data);
int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * total_nch);
int feed_nch = afe_handle->get_feed_channel_num(afe_data);
int sample_per_ms = afe_handle->get_samp_rate(afe_data) / 1000;
int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch);
assert(i2s_buff);
ESP_LOGI(TAG, "feed task start\n");
// FILE *fp = fopen("/sdcard/out", "w");
// if (fp == NULL) printf("can not open file\n");
while (s_cpu_test_task_flag) {
// FatfsComboWrite(i2s_buff, audio_chunksize * I2S_CHANNEL_NUM * sizeof(int16_t), 1, fp);
int count = 0;
while (1) {
count ++;
afe_handle->feed(afe_data, i2s_buff);
vTaskDelay((feed_chunksize / sample_per_ms) / portTICK_PERIOD_MS);
if (count > 100) {
break;
}
}
if (i2s_buff) {
free(i2s_buff);
@ -177,346 +212,89 @@ void test_feed_Task(void *arg)
vTaskDelete(NULL);
}
void test_detect_Task(void *arg)
void test_fetch_Task(void *arg)
{
// esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg;
int fetch_chunksize = afe_handle->get_fetch_chunksize(afe_data);
int16_t *buff = (int16_t *) malloc(fetch_chunksize * sizeof(int16_t));
assert(buff);
ESP_LOGI(TAG, "------------detect start------------\n");
// FILE *fp = fopen("/sdcard/out1", "w");
// if (fp == NULL) printf("can not open file\n");
while (s_cpu_test_task_flag) {
afe_task_into_t *afe_task_info = (afe_task_into_t *)arg;
esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle;
esp_afe_sr_data_t *afe_data = afe_task_info->afe_data;
detect_cnt = 0;
fetch_task_flag = 1;
while (1) {
afe_fetch_result_t* res = afe_handle->fetch(afe_data);
if (!res || res->ret_value == ESP_FAIL) {
printf("fetch error!\n");
break;
}
if (res->wakeup_state == WAKENET_DETECTED) {
ESP_LOGI(TAG, "wakeword detected\n");
}
if (res->wakeup_state == WAKENET_CHANNEL_VERIFIED) {
ESP_LOGI(TAG, "AFE_FETCH_CHANNEL_VERIFIED\n");
detect_cnt++;
}
}
if (buff) {
free(buff);
}
// TEST_ASSERT_EQUAL(true, detect_cnt > 0);
ESP_LOGI(TAG, "detect task quit\n");
fetch_task_flag = 0;
vTaskDelete(NULL);
}
esp_err_t audio_sys_get_real_time_stats(void)
TEST_CASE("afe performance test (1ch)", "[afe_perf]")
{
#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS)
TaskStatus_t *start_array = NULL, *end_array = NULL;
UBaseType_t start_array_size, end_array_size;
uint32_t start_run_time, end_run_time;
uint32_t task_elapsed_time, percentage_time;
esp_err_t ret;
const char *input_format = "MR";
afe_type_t afe_type = AFE_TYPE_VC;
afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST};
// Allocate array to store current task states
start_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET;
start_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * start_array_size);
assert(start_array);
// Get current task states
start_array_size = uxTaskGetSystemState(start_array, start_array_size, &start_run_time);
if (start_array_size == 0) {
ESP_LOGE(TAG, "Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
vTaskDelay(pdMS_TO_TICKS(AUDIO_SYS_TASKS_ELAPSED_TIME_MS));
// Allocate array to store tasks states post delay
end_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET;
end_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * end_array_size);
assert(end_array);
// Get post delay task states
end_array_size = uxTaskGetSystemState(end_array, end_array_size, &end_run_time);
if (end_array_size == 0) {
ESP_LOGE(TAG, "Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
// Calculate total_elapsed_time in units of run time stats clock period.
uint32_t total_elapsed_time = (end_run_time - start_run_time);
if (total_elapsed_time == 0) {
ESP_LOGE(TAG, "Delay duration too short. Trying increasing AUDIO_SYS_TASKS_ELAPSED_TIME_MS");
ret = ESP_FAIL;
if (start_array) {
free(start_array);
start_array = NULL;
}
if (end_array) {
free(end_array);
end_array = NULL;
}
return ret;
}
ESP_LOGI(TAG, "| Task | Run Time | Per | Prio | HWM | State | CoreId | Stack ");
// Match each task in start_array to those in the end_array
for (int i = 0; i < start_array_size; i++) {
for (int j = 0; j < end_array_size; j++) {
if (start_array[i].xHandle == end_array[j].xHandle) {
task_elapsed_time = end_array[j].ulRunTimeCounter - start_array[i].ulRunTimeCounter;
percentage_time = (task_elapsed_time * 100UL) / (total_elapsed_time * portNUM_PROCESSORS);
ESP_LOGI(TAG, "| %-17s | %-11d |%2d%% | %-4u | %-9u | %-7s | %-8x | %s",
start_array[i].pcTaskName, task_elapsed_time, percentage_time, start_array[i].uxCurrentPriority,
start_array[i].usStackHighWaterMark, task_state[(start_array[i].eCurrentState)],
start_array[i].xCoreID, task_stack[esp_ptr_internal(pxTaskGetStackStart(start_array[i].xHandle))]);
// Mark that task have been matched by overwriting their handles
start_array[i].xHandle = NULL;
end_array[j].xHandle = NULL;
break;
for (int mode_id=0; mode_id<2; mode_id++) {
afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]);
if (afe_config->wakenet_init && afe_config->wakenet_model_name) {
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
afe_task_into_t task_info;
task_info.afe_data = afe_data;
task_info.afe_handle = afe_handle;
task_info.feed_task = NULL;
task_info.fetch_task = NULL;
fetch_task_flag = 1;
xTaskCreatePinnedToCore(test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0);
xTaskCreatePinnedToCore(test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0);
while (fetch_task_flag) {
vTaskDelay(32 / portTICK_PERIOD_MS);
}
}
afe_config_free(afe_config);
}
// Print unmatched tasks
for (int i = 0; i < start_array_size; i++) {
if (start_array[i].xHandle != NULL) {
ESP_LOGI(TAG, "| %s | Deleted", start_array[i].pcTaskName);
}
}
for (int i = 0; i < end_array_size; i++) {
if (end_array[i].xHandle != NULL) {
ESP_LOGI(TAG, "| %s | Created", end_array[i].pcTaskName);
}
}
printf("\n");
ret = ESP_OK;
return ret;
#else
ESP_LOGW(TAG, "Please enbale `CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID` and `CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS` in menuconfig");
return ESP_FAIL;
#endif
}
void test_print_cpuloading(void *arg)
{
while (s_cpu_test_task_flag) {
audio_sys_get_real_time_stats();
int total_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT);
int internal_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
int psram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
ESP_LOGI(TAG, "total ram consume: %d KB", (total_ram_size_before - total_ram_size_after)/1024);
ESP_LOGI(TAG, "internal ram consume: %d KB", (internal_ram_size_before - internal_ram_size_after)/1024);
ESP_LOGI(TAG, "psram consume: %d KB\n\n", (psram_size_before - psram_size_after)/1024);
}
vTaskDelete(NULL);
}
// Measure AFE SR (wakenet) CPU load and memory consumption: create an AFE
// instance from the default config, run the feed/detect/cpuloading tasks
// for 10 s, then tear everything down.
TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe_sr]")
{
    srmodel_list_t *models = esp_srmodel_init("model");
    if (models != NULL) {
        for (int i = 0; i < models->num; i++) {
            printf("Load: %s\n", models->model_name[i]);
        }
    }
    char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL);
    // Guard against a NULL filter result: passing NULL to printf "%s" is
    // undefined behavior (matches the nsnet_name handling in the VC tests).
    printf("wn_name: %s\n", wn_name ? wn_name : "");
    // Snapshot free heap so test_print_cpuloading() can report deltas.
    total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
    psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
    esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE;
    afe_config_t afe_config = AFE_CONFIG_DEFAULT();
    afe_config.wakenet_model_name = wn_name;
    afe_data = afe_handle->create_from_config(&afe_config);
    if (!afe_data) {
        printf("afe_data is null!\n");
        // Release the model list on early return; the original leaked it here
        // while every other exit path calls esp_srmodel_deinit().
        esp_srmodel_deinit(models);
        return;
    }
    s_cpu_test_task_flag = 1;
    // feed on core 0; detect and the stats printer on core 1
    xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0);
    xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1);
    xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1);
    vTaskDelay(10000 / portTICK_PERIOD_MS);  // let the pipeline run for 10 s
    s_cpu_test_task_flag = 0;
    vTaskDelay(2000 / portTICK_PERIOD_MS);   // give the tasks time to exit
    ESP_LOGI(TAG, "destroy\n");
    afe_handle->destroy(afe_data);
    afe_data = NULL;
    esp_srmodel_deinit(models);
    ESP_LOGI(TAG, "successful\n");
}
/******************************************** Divide VC Test ********************************************/
// NOTE(review): the next two TEST_CASE headers appear back-to-back sharing a
// single body — this looks like a merge/diff-rendering artifact. As written,
// the first TEST_CASE has no body of its own, and the braces below do not
// balance: the aec/se/vad loops opened at the top are never closed before the
// next TEST_CASE begins. This section needs to be split back into two separate
// test cases — confirm against the pre-merge sources.
TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe_vc]")
TEST_CASE("afe performance test (2ch)", "[afe_perf]")
{
// Heap snapshots taken around each create/feed/destroy cycle.
int start_total_mem_size = 0;
int start_internal_mem_size = 0;
int start_spiram_mem_size = 0;
int end_total_mem_size = 0;
int end_internal_mem_size = 0;
int end_spiram_mem_size = 0;
// 2-mic + reference input layout; used by the performance loop at the bottom.
const char *input_format = "MMR";
afe_type_t afe_type = AFE_TYPE_VC;
afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST};
int audio_chunksize = 0;
int16_t *feed_buff = NULL;
// test all setting
srmodel_list_t *models = esp_srmodel_init("model");
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE;
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
afe_config.wakenet_init = false;
afe_config.voice_communication_init = true;
// Sweep every on/off combination of AEC, SE, VAD and VC-AGC.
for (int aec_init = 0; aec_init < 2; aec_init++) {
for (int se_init = 0; se_init < 2; se_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int voice_communication_agc_init = 0; voice_communication_agc_init < 2; voice_communication_agc_init++) {
// Neural NS (NS_MODE_NET) is only available on ESP32-S3/P4; other targets
// fix the mode to the signal-processing variant.
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
for (int afe_ns_mode = 0; afe_ns_mode < 2; afe_ns_mode++) {
#else
int afe_ns_mode = NS_MODE_SSP;
#endif
printf("aec_init: %d, se_init: %d, vad_init: %d, voice_communication_agc_init: %d, afe_ns_mode: %d\n", aec_init, se_init, vad_init, voice_communication_agc_init, afe_ns_mode);
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.voice_communication_agc_init = voice_communication_agc_init;
afe_config.afe_ns_mode = (afe_ns_mode_t)afe_ns_mode;
//start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
//start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
//start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
// Run each configuration twice; only the second iteration is required to be
// exactly leak-free (the first may warm up internal allocations).
for (int i = 0; i < 2; i++) {
printf("index: %d\n", i);
vTaskDelay(500 / portTICK_PERIOD_MS);
start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
// NOTE(review): this shadows the outer `models` declared above — the outer
// list is never deinitialized by this loop; verify intended.
srmodel_list_t *models = esp_srmodel_init("model");
char *nsnet_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");
afe_config.afe_ns_model_name = nsnet_name;
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
printf("afe_data is null\n");
continue;
}
// One feed with a zeroed buffer, then immediate destroy: exercises the
// create/feed/destroy path for leak checking.
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
}
esp_srmodel_deinit(models);
vTaskDelay(1000 / portTICK_PERIOD_MS);
end_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
end_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
end_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
printf("memory leak: %d\n", start_total_mem_size - end_total_mem_size);
if (i > 0) { // skip index = 0
TEST_ASSERT_EQUAL(start_internal_mem_size, end_internal_mem_size);
TEST_ASSERT_EQUAL(start_spiram_mem_size, end_spiram_mem_size);
TEST_ASSERT_EQUAL(start_total_mem_size, end_total_mem_size);
} else {
TEST_ASSERT_EQUAL(true, (start_total_mem_size - end_total_mem_size) < 1000);
}
}
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
}
#endif
}
// NOTE(review): this loop appears to belong to the "afe performance test
// (2ch)" body — it runs feed/fetch tasks per mode rather than leak checks.
for (int mode_id=0; mode_id<2; mode_id++) {
afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]);
if (afe_config->wakenet_init && afe_config->wakenet_model_name) {
esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);
afe_task_into_t task_info;
task_info.afe_data = afe_data;
task_info.afe_handle = afe_handle;
task_info.feed_task = NULL;
task_info.fetch_task = NULL;
fetch_task_flag = 1;
xTaskCreatePinnedToCore(&test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0);
xTaskCreatePinnedToCore(&test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0);
// Wait until the fetch task signals completion.
while (fetch_task_flag) {
vTaskDelay(32 / portTICK_PERIOD_MS);
}
}
afe_config_free(afe_config);
}
}
// Measure AFE VC (voice communication) CPU load and memory consumption:
// build a VC pipeline (AGC on, neural NS where the target supports it),
// run the feed/detect/cpuloading tasks for 20 s, then tear down.
TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe_vc]")
{
    // Snapshot free heap so test_print_cpuloading() can report deltas.
    total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
    psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);

    srmodel_list_t *models = esp_srmodel_init("model");
    char *nsnet_name = NULL;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
    nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
    printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");

    esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE;
    afe_config_t cfg = AFE_CONFIG_DEFAULT();
    cfg.wakenet_init = false;
    cfg.voice_communication_init = true;
    cfg.voice_communication_agc_init = true;
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
    cfg.afe_ns_mode = NS_MODE_NET;  // neural NS model available on S3/P4
#else
    cfg.afe_ns_mode = NS_MODE_SSP;  // fall back to signal-processing NS
#endif
    cfg.afe_ns_model_name = nsnet_name;

    afe_data = afe_handle->create_from_config(&cfg);
    if (!afe_data) {
        printf("afe_data is null!\n");
        return;
    }

    s_cpu_test_task_flag = 1;
    // feed on core 0; detect and the stats printer on core 1
    xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0);
    xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1);
    xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1);
    vTaskDelay(20000 / portTICK_PERIOD_MS);  // run the pipeline for 20 s
    s_cpu_test_task_flag = 0;
    vTaskDelay(2000 / portTICK_PERIOD_MS);   // give the tasks time to exit

    ESP_LOGI(TAG, "destroy\n");
    afe_handle->destroy(afe_data);
    esp_srmodel_deinit(models);
    afe_data = NULL;
    ESP_LOGI(TAG, "successful\n");
}
}

View File

@ -35,7 +35,6 @@ def test_multinet_p4(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'mn5q8_en',
'wn9_hilexin',
],
)
@ -47,8 +46,7 @@ def test_wakenet(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'p4_mn7_en',
'p4_nsnet2',
'p4_wn9_hilexin',
],
)
def test_wakenet_p4(dut: Dut)-> None:
@ -59,44 +57,21 @@ def test_wakenet_p4(dut: Dut)-> None:
@pytest.mark.parametrize(
'config',
[
'afe',
'wn9_hilexin',
'vadnet',
],
)
def test_sr_afe(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_sr", timeout=100000)
dut.run_all_single_board_cases(group="afe", timeout=3600)
@pytest.mark.target('esp32p4')
@pytest.mark.env('esp32p4')
@pytest.mark.parametrize(
'config',
[
'p4_mn7_cn',
'p4_afe',
'p4_wn9_hilexin',
],
)
def test_sr_afe_p4(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_sr", timeout=100000)
@pytest.mark.target('esp32s3')
@pytest.mark.env('esp32s3')
@pytest.mark.parametrize(
'config',
[
'nsnet2',
],
)
def test_vc_afe(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_vc", timeout=100000)
@pytest.mark.target('esp32p4')
@pytest.mark.env('esp32p4')
@pytest.mark.parametrize(
'config',
[
'p4_nsnet2',
],
)
def test_vc_afe_p4(dut: Dut)-> None:
dut.run_all_single_board_cases(group="afe_vc", timeout=100000)
dut.run_all_single_board_cases(group="afe", timeout=3600)

View File

@ -2,20 +2,22 @@
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y
CONFIG_ESP32S3_DATA_CACHE_64KB=y
CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y

View File

@ -1,5 +1,5 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
@ -9,6 +9,10 @@ CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_esp32.csv"
CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_INT_WDT_TIMEOUT_MS=1000
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744

View File

@ -1,23 +0,0 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y
CONFIG_ESP32S3_DATA_CACHE_64KB=y
CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024

View File

@ -0,0 +1,23 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_ESP32P4_REV_MIN_0=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_MBEDTLS_CMAC_C=y
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

View File

@ -1,10 +1,12 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SR_MN_CN_MULTINET7_QUANT=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y
@ -14,7 +16,6 @@ CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=8000
CONFIG_ESP_INT_WDT=n
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_FREERTOS_HZ=1000

View File

@ -1,10 +1,12 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y
CONFIG_SR_WN_WN9_HIESP=y
CONFIG_SR_MN_EN_MULTINET7_QUANT=y
CONFIG_COMPILER_OPTIMIZATION_PERF=y

View File

@ -1,22 +1,20 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32p4"
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_WN_WN9_HIESP=y
CONFIG_SR_NSN_NSNET2=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240
CONFIG_COMPILER_OPTIMIZATION_PERF=y
CONFIG_ESP32P4_REV_MIN_0=y
CONFIG_SPIRAM=y
CONFIG_SPIRAM_SPEED_200M=y
CONFIG_CACHE_L2_CACHE_256KB=y
CONFIG_CACHE_L2_CACHE_LINE_128B=y
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
CONFIG_ESP_MAIN_TASK_STACK_SIZE=10000
CONFIG_ESP_INT_WDT=n
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_FREERTOS_HZ=1000
CONFIG_MBEDTLS_CMAC_C=y
CONFIG_IDF_EXPERIMENTAL_FEATURES=y

View File

@ -2,13 +2,13 @@
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32s3"
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_WN_WN9_HILEXIN=y
CONFIG_ESP_PHY_REDUCE_TX_POWER=y
CONFIG_SPIRAM=y
CONFIG_ESP_TASK_WDT_EN=n
CONFIG_ESP_TASK_WDT_INIT=n
CONFIG_SPIRAM_MODE_OCT=y
CONFIG_SPIRAM_SPEED_80M=y
CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y
@ -21,4 +21,4 @@ CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
CONFIG_LWIP_TCP_WND_DEFAULT=5744
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024

View File

@ -0,0 +1,6 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
#
CONFIG_PARTITION_TABLE_CUSTOM=y
CONFIG_SR_VADN_VADNET1_MEDIUM=y