diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4c1346d..9a1ce03 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -315,4 +315,4 @@ push_to_github: - echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config - git remote remove github &>/dev/null || true - git remote add github git@github.com:espressif/esp-sr.git - - git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}" + - git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}" \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index 5f768f9..e9da512 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,13 +1,14 @@ -if(IDF_TARGET STREQUAL "esp32") +if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32")) set(include_dirs - src/include - esp-tts/esp_tts_chinese/include - include/esp32 + "esp-tts/esp_tts_chinese/include" + "include/${IDF_TARGET}" + "src/include" ) + set(srcs - src/model_path.c - src/esp_mn_speech_commands.c - src/esp_process_sdkconfig.c + "src/model_path.c" + "src/esp_mn_speech_commands.c" + "src/esp_process_sdkconfig.c" ) set(requires @@ -20,244 +21,320 @@ if(IDF_TARGET STREQUAL "esp32") ENDIF (IDF_VERSION_MAJOR GREATER 4) idf_component_register(SRCS ${srcs} - INCLUDE_DIRS ${include_dirs} - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) + INCLUDE_DIRS ${include_dirs} + REQUIRES ${requires} + PRIV_REQUIRES spi_flash) - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32") - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32") - add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libmultinet.a" PRIV_REQUIRES 
${COMPONENT_NAME}) - add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) - - target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" - multinet - dl_lib - c_speech_features - wakeword_model - multinet2_ch - esp_audio_processor - esp_audio_front_end - esp_tts_chinese - voice_set_xiaole - wakenet - "-Wl,--end-group") -elseif(${IDF_TARGET} STREQUAL "esp32s3") - set(include_dirs - src/include - esp-tts/esp_tts_chinese/include - include/esp32s3 - ) - set(srcs - src/model_path.c - src/esp_mn_speech_commands.c - src/esp_process_sdkconfig.c - ) - - set(requires - json - spiffs - ) - - IF (IDF_VERSION_MAJOR GREATER 4) - list(APPEND requires esp_partition) - ENDIF (IDF_VERSION_MAJOR GREATER 4) - - idf_component_register(SRCS ${srcs} - INCLUDE_DIRS ${include_dirs} - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) - - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3") - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3") - - add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) - 
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}") + target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}") + add_prebuilt_library(dl_lib "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libdl_lib.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(fst "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libfst.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(nsnet 
"${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB) - target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" - hufzip + set(sr_libs dl_lib - fst - c_speech_features $ + c_speech_features esp_audio_front_end esp_audio_processor - multinet - flite_g2p esp_tts_chinese voice_set_xiaole + fst + flite_g2p + hufzip + multinet nsnet vadnet - wakenet - "-Wl,--end-group") + wakenet) - set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py) - idf_build_get_property(build_dir BUILD_DIR) - set(image_file ${build_dir}/srmodels/srmodels.bin) - - add_custom_command( - OUTPUT ${image_file} - COMMENT "Move and Pack models..." - COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir} - DEPENDS ${SDKCONFIG} - VERBATIM) - - add_custom_target(srmodels_bin ALL DEPENDS ${image_file}) - add_dependencies(flash srmodels_bin) - - partition_table_get_partition_info(size "--partition-name model" "size") - partition_table_get_partition_info(offset "--partition-name model" "offset") - - if("${size}" AND "${offset}") - esptool_py_flash_to_partition(flash "model" "${image_file}") - else() - set(message "Failed to find model in partition table file" - "Please add a line(Name=model, Size>recommended size in log) to the partition file.") + if(${IDF_TARGET} STREQUAL "esp32") + add_prebuilt_library(multinet2_ch "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libmultinet2_ch.a" PRIV_REQUIRES ${COMPONENT_NAME}) + list(APPEND sr_libs multinet2_ch) endif() -elseif(${IDF_TARGET} STREQUAL "esp32p4") - set(include_dirs - src/include - esp-tts/esp_tts_chinese/include - include/esp32p4 - ) - set(srcs - src/model_path.c - src/esp_mn_speech_commands.c - src/esp_process_sdkconfig.c - ) - - set(requires - json - spiffs - ) - - IF (IDF_VERSION_MAJOR GREATER 4) - list(APPEND requires esp_partition) - ENDIF (IDF_VERSION_MAJOR GREATER 4) - - idf_component_register(SRCS 
${srcs} - INCLUDE_DIRS ${include_dirs} - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) - - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4") - target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4") - - add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) - add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) - - idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB) target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" - hufzip - dl_lib - fst - c_speech_features - $ - esp_audio_front_end - esp_audio_processor - multinet - flite_g2p - esp_tts_chinese - voice_set_xiaole - wakenet - vadnet - nsnet + ${sr_libs} "-Wl,--end-group") - set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py) - idf_build_get_property(build_dir BUILD_DIR) - set(image_file ${build_dir}/srmodels/srmodels.bin) - add_custom_command( - OUTPUT ${image_file} - COMMENT "Move and Pack models..." 
- COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir} - DEPENDS ${SDKCONFIG} - VERBATIM) + if(CONFIG_IDF_TARGET_ESP32S3 OR CONFIG_IDF_TARGET_ESP32P4) + set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py) + idf_build_get_property(build_dir BUILD_DIR) + set(image_file ${build_dir}/srmodels/srmodels.bin) - add_custom_target(srmodels_bin ALL DEPENDS ${image_file}) - add_dependencies(flash srmodels_bin) + add_custom_command( + OUTPUT ${image_file} + COMMENT "Move and Pack models..." + COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir} + DEPENDS ${SDKCONFIG} + VERBATIM) - partition_table_get_partition_info(size "--partition-name model" "size") - partition_table_get_partition_info(offset "--partition-name model" "offset") + add_custom_target(srmodels_bin ALL DEPENDS ${image_file}) + add_dependencies(flash srmodels_bin) - if("${size}" AND "${offset}") - esptool_py_flash_to_partition(flash "model" "${image_file}") - else() - set(message "Failed to find model in partition table file" - "Please add a line(Name=model, Size>recommended size in log) to the partition file.") + partition_table_get_partition_info(size "--partition-name model" "size") + partition_table_get_partition_info(offset "--partition-name model" "offset") + + if("${size}" AND "${offset}") + esptool_py_flash_to_partition(flash "model" "${image_file}") + else() + set(message "Failed to find model in partition table file" + "Please add a line(Name=model, Size>recommended size in log) to the partition file.") + endif() endif() -elseif(${IDF_TARGET} STREQUAL "esp32s2") - set(requires + +elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6")) + +set(requires spiffs ) - IF (IDF_VERSION_MAJOR GREATER 4) - list(APPEND requires esp_partition) - ENDIF (IDF_VERSION_MAJOR GREATER 4) +IF (IDF_VERSION_MAJOR GREATER 4) + list(APPEND requires esp_partition) +ENDIF (IDF_VERSION_MAJOR GREATER 4) 
- idf_component_register(SRCS . - INCLUDE_DIRS esp-tts/esp_tts_chinese/include - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) +idf_component_register(SRCS . + INCLUDE_DIRS esp-tts/esp_tts_chinese/include + REQUIRES ${requires} + PRIV_REQUIRES spi_flash) - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2") - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" - esp_tts_chinese - voice_set_xiaole - "-Wl,--end-group") -elseif(${IDF_TARGET} STREQUAL "esp32c3") - set(requires - spiffs - ) +target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}") +add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME}) +add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME}) +target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" + esp_tts_chinese + voice_set_xiaole + "-Wl,--end-group") - IF (IDF_VERSION_MAJOR GREATER 4) - list(APPEND requires esp_partition) - ENDIF (IDF_VERSION_MAJOR GREATER 4) - - idf_component_register(SRCS . - INCLUDE_DIRS esp-tts/esp_tts_chinese/include - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) - - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3") - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" - esp_tts_chinese - voice_set_xiaole - "-Wl,--end-group") -elseif(${IDF_TARGET} STREQUAL "esp32c6") - set(requires - spiffs - ) - - IF (IDF_VERSION_MAJOR GREATER 4) - list(APPEND requires esp_partition) - ENDIF (IDF_VERSION_MAJOR GREATER 4) - - idf_component_register(SRCS . 
- INCLUDE_DIRS esp-tts/esp_tts_chinese/include - REQUIRES ${requires} - PRIV_REQUIRES spi_flash) - - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6") - target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" - esp_tts_chinese - voice_set_xiaole - "-Wl,--end-group") endif() + +# elseif(${IDF_TARGET} STREQUAL "esp32s3") +# set(include_dirs +# src/include +# esp-tts/esp_tts_chinese/include +# include/esp32s3 +# ) +# set(srcs +# src/model_path.c +# src/esp_mn_speech_commands.c +# src/esp_process_sdkconfig.c +# ) + +# set(requires +# json +# spiffs +# ) + +# IF (IDF_VERSION_MAJOR GREATER 4) +# list(APPEND requires esp_partition) +# ENDIF (IDF_VERSION_MAJOR GREATER 4) + +# idf_component_register(SRCS ${srcs} +# INCLUDE_DIRS ${include_dirs} +# REQUIRES ${requires} +# PRIV_REQUIRES spi_flash) + +# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3") +# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s3") + +# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# 
add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + +# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB) + +# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" +# hufzip +# dl_lib +# fst +# c_speech_features +# $ +# esp_audio_front_end +# esp_audio_processor +# multinet +# flite_g2p +# esp_tts_chinese +# voice_set_xiaole +# nsnet +# vadnet +# wakenet +# "-Wl,--end-group") + +# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py) +# idf_build_get_property(build_dir BUILD_DIR) +# set(image_file ${build_dir}/srmodels/srmodels.bin) + +# add_custom_command( +# OUTPUT ${image_file} +# COMMENT "Move and Pack models..." +# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir} +# DEPENDS ${SDKCONFIG} +# VERBATIM) + +# add_custom_target(srmodels_bin ALL DEPENDS ${image_file}) +# add_dependencies(flash srmodels_bin) + +# partition_table_get_partition_info(size "--partition-name model" "size") +# partition_table_get_partition_info(offset "--partition-name model" "offset") + +# if("${size}" AND "${offset}") +# esptool_py_flash_to_partition(flash "model" "${image_file}") +# else() +# set(message "Failed to find model in partition table file" +# "Please add a line(Name=model, Size>recommended size in log) to the partition file.") +# endif() +# elseif(${IDF_TARGET} STREQUAL "esp32p4") +# set(include_dirs +# src/include +# esp-tts/esp_tts_chinese/include +# include/esp32p4 +# ) +# set(srcs +# src/model_path.c +# src/esp_mn_speech_commands.c +# src/esp_process_sdkconfig.c +# ) + +# set(requires +# json +# spiffs +# ) + +# IF (IDF_VERSION_MAJOR GREATER 4) +# list(APPEND requires esp_partition) +# ENDIF (IDF_VERSION_MAJOR GREATER 4) + +# idf_component_register(SRCS ${srcs} +# INCLUDE_DIRS ${include_dirs} +# REQUIRES ${requires} +# PRIV_REQUIRES spi_flash) + +# target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4") +# 
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32p4") + +# add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) +# add_prebuilt_library(nsnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libnsnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + +# idf_component_get_property(esp_dsp_lib espressif__esp-dsp COMPONENT_LIB) + +# target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group" +# hufzip +# dl_lib +# fst +# c_speech_features +# $ +# esp_audio_front_end +# esp_audio_processor +# multinet +# flite_g2p +# esp_tts_chinese +# voice_set_xiaole +# wakenet +# vadnet +# nsnet +# "-Wl,--end-group") + +# set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py) +# idf_build_get_property(build_dir BUILD_DIR) +# set(image_file ${build_dir}/srmodels/srmodels.bin) + +# add_custom_command( +# OUTPUT ${image_file} +# COMMENT "Move and Pack models..." 
+# COMMAND python ${MVMODEL_EXE} -d1 ${SDKCONFIG} -d2 ${COMPONENT_PATH} -d3 ${build_dir} +# DEPENDS ${SDKCONFIG} +# VERBATIM) + +# add_custom_target(srmodels_bin ALL DEPENDS ${image_file}) +# add_dependencies(flash srmodels_bin) + +# partition_table_get_partition_info(size "--partition-name model" "size") +# partition_table_get_partition_info(offset "--partition-name model" "offset") + +# if("${size}" AND "${offset}") +# esptool_py_flash_to_partition(flash "model" "${image_file}") +# else() +# set(message "Failed to find model in partition table file" +# "Please add a line(Name=model, Size>recommended size in log) to the partition file.") +# endif() +# elseif(${IDF_TARGET} STREQUAL "esp32s2") +# set(requires +# spiffs +# ) + +# IF (IDF_VERSION_MAJOR GREATER 4) +# list(APPEND requires esp_partition) +# ENDIF (IDF_VERSION_MAJOR GREATER 4) + +# idf_component_register(SRCS . +# INCLUDE_DIRS esp-tts/esp_tts_chinese/include +# REQUIRES ${requires} +# PRIV_REQUIRES spi_flash) + +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32s2") +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" +# esp_tts_chinese +# voice_set_xiaole +# "-Wl,--end-group") +# elseif(${IDF_TARGET} STREQUAL "esp32c3") +# set(requires +# spiffs +# ) + +# IF (IDF_VERSION_MAJOR GREATER 4) +# list(APPEND requires esp_partition) +# ENDIF (IDF_VERSION_MAJOR GREATER 4) + +# idf_component_register(SRCS . 
+# INCLUDE_DIRS esp-tts/esp_tts_chinese/include +# REQUIRES ${requires} +# PRIV_REQUIRES spi_flash) + +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c3") +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" +# esp_tts_chinese +# voice_set_xiaole +# "-Wl,--end-group") +# elseif(${IDF_TARGET} STREQUAL "esp32c6") +# set(requires +# spiffs +# ) + +# IF (IDF_VERSION_MAJOR GREATER 4) +# list(APPEND requires esp_partition) +# ENDIF (IDF_VERSION_MAJOR GREATER 4) + +# idf_component_register(SRCS . +# INCLUDE_DIRS esp-tts/esp_tts_chinese/include +# REQUIRES ${requires} +# PRIV_REQUIRES spi_flash) + +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/esp32c6") +# target_link_libraries(${COMPONENT_TARGET} INTERFACE "-Wl,--start-group" +# esp_tts_chinese +# voice_set_xiaole +# "-Wl,--end-group") +# endif() diff --git a/docs/en/benchmark/README.rst b/docs/en/benchmark/README.rst index 1c584be..0437387 100644 --- a/docs/en/benchmark/README.rst +++ b/docs/en/benchmark/README.rst @@ -45,6 +45,23 @@ Resource Consumption | AFE Layer | 227 KB | | | +-----------------+-----------------+-----------------+-----------------+ + + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU | + +==============+======+===========+===============+============+================+=================+ + | MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% | + 
+--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + .. only:: esp32p4 +-----------------+-----------------+-----------------+-----------------+ @@ -52,21 +69,33 @@ Resource Consumption | | | loading(compute | | | | | with 2 cores) | | +=================+=================+=================+=================+ - | AEC(LOW_COST) | 152.3 KB | 8% | 32 ms | + | AEC(LOW_COST) | 152.3 KB | 6% | 32 ms | +-----------------+-----------------+-----------------+-----------------+ - | AEC(HIGH_PERF) | 166 KB | 11% | 32 ms | + | BSS(LOW_COST) | 198.7 KB | 3% | 64 ms | +-----------------+-----------------+-----------------+-----------------+ - | BSS(LOW_COST) | 198.7 KB | 6% | 64 ms | - +-----------------+-----------------+-----------------+-----------------+ - | BSS(HIGH_PERF) | 215.5 KB | 7% | 64 ms | - +-----------------+-----------------+-----------------+-----------------+ - | NS | 27 KB | 5% | 10 ms | + | NS | 27 KB | 3% | 10 ms | +-----------------+-----------------+-----------------+-----------------+ | MISO | 56 KB | 8% | 16 ms | +-----------------+-----------------+-----------------+-----------------+ | AFE Layer | 227 KB | | | +-----------------+-----------------+-----------------+-----------------+ + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU | + 
+==============+======+===========+===============+============+=================+=================+ + | MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + WakeNet ------- diff --git a/docs/zh_CN/benchmark/README.rst b/docs/zh_CN/benchmark/README.rst index 1a94847..5d10073 100644 --- a/docs/zh_CN/benchmark/README.rst +++ b/docs/zh_CN/benchmark/README.rst @@ -49,6 +49,22 @@ AFE | AFE Layer | 227 KB | | | +-----------------+-----------------+-----------------+-----------------+ + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU | + +==============+======+===========+===============+============+================+=================+ + | MR | SR | LOW_COST | 72348 | 732932 | 8.4% | 14.9% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | SR | HIGH_PERF | 78016 | 734980 | 9.4% | 14.9% | + 
+--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | VC | LOW_COST | 50316 | 821564 | 60.0% | 8.1% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MR | VC | HIGH_PERF | 93668 | 824144 | 64.0% | 8.2% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MMR | SR | LOW_COST | 76684 | 1175148 | 36.6% | 30.2% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + | MMR | SR | HIGH_PERF | 99064 | 1174960 | 38.8% | 30.0% | + +--------------+------+-----------+---------------+------------+----------------+-----------------+ + .. only:: esp32p4 +-----------------+-----------------+-----------------+-----------------+ @@ -67,6 +83,22 @@ AFE | AFE Layer | 227 KB | | | +-----------------+-----------------+-----------------+-----------------+ + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | Input Format | Type | Mode | Internal RAM | PSRAM | Feed Task CPU | Fetch Task CPU | + +==============+======+===========+===============+============+=================+=================+ + | MR | SR | LOW_COST | 75404 | 751292 | 10.6% | 11.3% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | SR | HIGH_PERF | 75128 | 751292 | 10.6% | 11.3% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | VC | LOW_COST | 76192 | 841300 | 40.3% | 5.7% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MR | VC | HIGH_PERF | 119536 | 843880 | 42.6% | 5.7% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MMR | SR | LOW_COST | 79940 | 1202692 | 28.4% | 24.9% | + 
+--------------+------+-----------+---------------+------------+-----------------+-----------------+ + | MMR | SR | HIGH_PERF | 79940 | 1202692 | 28.4% | 24.9% | + +--------------+------+-----------+---------------+------------+-----------------+-----------------+ + WakeNet ------- diff --git a/include/esp32/esp_aec.h b/include/esp32/esp_aec.h index deb031c..36de9c1 100644 --- a/include/esp32/esp_aec.h +++ b/include/esp32/esp_aec.h @@ -21,80 +21,72 @@ extern "C" { #endif #define USE_AEC_FFT // Not kiss_fft -#define AEC_USE_SPIRAM 0 #define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz -//#define AEC_FRAME_LENGTH_MS 16 #define AEC_FRAME_LENGTH_MS 32 -#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel -typedef void* aec_handle_t; +typedef struct aec_handle_t aec_handle_t; +typedef enum { + AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC for speech recognition + AEC_MODE_SR_HIGH_PERF = 1, // High Performance AEC for speech recognition + AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication + AEC_MODE_VOIP_HIGH_PERF = 4, // High Performance AEC for voice communication +} aec_mode_t; /** * @brief Creates an instance to the AEC structure. + * Please get frame size by aec_get_chunksize() function * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms. - * - * @param filter_length Number of samples of echo to cancel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
+ * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length); +aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode); /** - * @brief Creates an instance to the AEC structure. + * @brief Creates an instance to the AEC structure, same with aec_create(). * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * - * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms. - * - * @param filter_length Number of samples of echo to cancel. - * - * @param nch Number of input signal channel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption. + * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch); - -/** - * @brief Creates an instance of more powerful AEC. - * - * @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly. - * - * @param nch Number of microphones. - * - * @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3. 
- * - * @return - * - NULL: Create failed - * - Others: An Instance of AEC - */ -aec_handle_t aec_pro_create(int frame_length, int nch, int mode); +aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode); /** * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic. * - * @param inst The instance of AEC. - * + * @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc(). + * + * @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..." * @param indata An array of 16-bit signed audio samples from mic. - * * @param refdata An array of 16-bit signed audio samples sent to the speaker. - * - * @param outdata Returns near-end signal with echo removed. - * + * @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..." * @return None * */ -void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata); +void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int aec_get_chunksize(const aec_handle_t *handle); + +/** + * @brief Get AEC mode string + * + * @param aec_mode The mode of AEC. 
+ * + * @return AEC mode string + */ +char * aec_get_mode_string(aec_mode_t aec_mode); /** * @brief Free the AEC instance @@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int * @return None * */ -void aec_destroy(aec_handle_t inst); +void aec_destroy(aec_handle_t *handel); #ifdef __cplusplus } diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index c32689d..694caa2 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -1,24 +1,41 @@ #pragma once #include "stdint.h" +#include "stdbool.h" +#include "stdlib.h" #include "esp_wn_iface.h" #include "esp_wn_models.h" #include "esp_vad.h" - +#include "esp_aec.h" +#include "esp_agc.h" +#include "model_path.h" +#include "esp_vadn_models.h" +#include "esp_nsn_models.h" #ifdef __cplusplus extern "C" { #endif //AFE: Audio Front-End //SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition - +//VC: Voice Communication //Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, - SR_MODE_HIGH_PERF = 1 + SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; +//Set AFE mode +typedef enum { + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode +} afe_mode_t; + +//Set AFE type +typedef enum { + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression +} afe_type_t; + typedef enum { AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance @@ -26,24 +43,30 @@ typedef enum { } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet 
is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num - int mic_num; // mic channel num - int ref_num; // reference channel num - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t* mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t* ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - NS_MODE_SSP = 0, // speech signal process method - NS_MODE_NET = 1, // deep noise suppression net method + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; +typedef enum { + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated +} afe_agc_mode_t; /** * @brief Function to get the debug audio data @@ -66,148 +89,192 @@ typedef struct { } afe_debug_hook_t; typedef struct { - bool aec_init; - bool se_init; - bool vad_init; + /********** AEC(Acoustic Echo Cancellation) **********/ + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec + + /********** SE(Speech Enhancement, microphone array processing) 
**********/ + bool se_init; // Whether to init se + + /********** NS(Noise Suppression) **********/ + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + + /********** VAD(Voice Activity Detection) **********/ + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + + /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - bool voice_communication_init; - bool voice_communication_agc_init; // AGC swich for voice communication - int voice_communication_agc_gain; // AGC gain(dB) for voice communication - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 char *wakenet_model_name; // The model name of wakenet 1 char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; - afe_sr_mode_t afe_mode; - int afe_perferred_core; - int afe_perferred_priority; - int afe_ringbuf_size; - afe_memory_alloc_mode_t memory_alloc_mode; - float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0]. - // This value acts directly on the output amplitude: out_linear_gain * amplitude. - afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
+ det_mode_t wakenet_mode; // The mode of wakenet + + /********** AGC(Automatic Gain Control) **********/ + bool agc_init; // Whether to init agc + afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + + /********** General AFE(Audio Front End) parameter **********/ afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. + afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; - afe_ns_mode_t afe_ns_mode; - char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet - char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms - int vad_min_noise_ms; // The minimum duration of noise/silence in ms. 
It should be bigger than 64 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; +/** + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. + * You can manually fine-tune it after creating the configuration + * + * The input format: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param models Models from partition, which is configured by Kconfig + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_config_t* The default config of afe + */ +afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); -#if CONFIG_IDF_TARGET_ESP32 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_HIGH_PERF, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = 
{{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32P4 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32S3 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_2CH_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - 
.afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 3, \ - .mic_num = 2, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#endif +/** + * @brief Check AFE configuration and make sure it is correct. + * + * @warning If there is a configuration conflict, this function will modify some parameters. + * The guiding behind these modifications is to maintain the highest performance of the output audio and results. + * And remove the conflict between different algorithms. + * + * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. + * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
+ * + * @param afe_config Input AFE config + * + * @return afe_config_t* The modified AFE config + */ +afe_config_t *afe_config_check(afe_config_t *afe_config); + +/** + * @brief Parse input format + * + * @param input_format The input format, same with afe_config_init() function + * @param pcm_config The pcm config + * + * @return true if the input format is parsed successfully, otherwise false + */ +bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse I2S input data + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param mic_data The output microphone data + * @param ref_data The output playback reference data + * @param pcm_config The pcm config + * + */ +void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse input data, from interleaved arrangement to contiguous arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Format input data, from contiguous arrangement to interleaved arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. 
+ * + * @param data The input audio data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param factor The gain factor + * + * @return int16_t* The output audio data + */ +int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. + * + * @param in_data The input audio data + * @param in_frame_size Input data frame size of input + * @param channel_num The channel number of input data, which is same as output data + * @param out_data The output audio data + * @param out_frame_size Onput data frame size of input + * + */ +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); + +/** + * @brief Copy the afe config + * + * @param dst_config The destination afe config + * @param src_config The source afe config + * + * @return The destination afe config + */ +afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); + +/** + * @brief Print the afe config + * + * @param afe_config The afe config + */ +void afe_config_print(const afe_config_t *afe_config); + +/** + * @brief Allocate afe config + * + * @return The afe config pointer + */ +afe_config_t *afe_config_alloc(); + +/** + * @brief Free afe config + * + * @param afe_config The afe config pointer + */ +void afe_config_free(afe_config_t *afe_config); #ifdef __cplusplus } diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index 84d7000..f434c3e 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -1,7 +1,10 @@ #pragma once #include "stdint.h" +#include "stdlib.h" +#include "stdbool.h" #include "esp_afe_config.h" - +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif @@ -13,13 +16,15 @@ extern "C" { //Opaque AFE_SR data container typedef struct 
esp_afe_sr_data_t esp_afe_sr_data_t; + + /** * @brief The state of vad */ typedef enum { - AFE_VAD_SILENCE = 0, // noise or silence - AFE_VAD_SPEECH // speech + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** @@ -27,7 +32,7 @@ typedef enum */ typedef struct afe_fetch_result_t { - int16_t *data; // the data of audio. + int16_t *data; // the target channel data of audio. int data_size; // the size of data. The unit is byte. int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. int vad_cache_size; // the size of vad_cache. The unit is byte. @@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t wakenet_state_t wakeup_state; // the value is wakenet_state_t int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - afe_vad_state_t vad_state; // the value is afe_vad_state_t + vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data void* reserved; // reserved for future use } afe_fetch_result_t; @@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** - * @brief Get the total channel number which be config + * @brief Get the channel number * * @param afe The AFE_SR object to query * @return The amount of total channels */ -typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Get the mic channel number which be config - * - * @param afe The AFE_SR object to query - * @return The amount of mic channels - */ typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe); /** @@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * @brief fetch enhanced samples of an audio stream from the AFE_SR * * @Warning The output is single channel data, no matter how many channels the input is. + * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) */ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +/** + * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` + * + * @Warning The output is single channel data, no matter how many channels the input is. + * + * @param afe The AFE_SR object to query + * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ */ +typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); + /** * @brief reset ringbuf of AFE. * @@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); /** - * @brief Disable wakenet model. + * @brief Enable VAD algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable wakenet model. + * @brief Disable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Disable AEC algorithm. + * @brief Enable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable AEC algorithm. + * @brief Print all functions/modules/algorithms pipeline. + * The pipeline is the order of the functions/modules/algorithms. + * The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output] * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Disable SE algorithm. - * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Enable SE algorithm. 
- * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe); +typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); /** * @brief Destroy a AFE_SR instance @@ -191,22 +187,41 @@ typedef struct { esp_afe_sr_iface_op_create_from_config_t create_from_config; esp_afe_sr_iface_op_feed_t feed; esp_afe_sr_iface_op_fetch_t fetch; + esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay; esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; esp_afe_sr_iface_op_set_wakenet_t set_wakenet; - esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet; - esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet; - esp_afe_sr_iface_op_disable_aec_t disable_aec; - esp_afe_sr_iface_op_enable_aec_t enable_aec; - esp_afe_sr_iface_op_disable_se_t disable_se; - esp_afe_sr_iface_op_enable_se_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_wakenet; + esp_afe_sr_iface_op_enable_func_t enable_wakenet; + esp_afe_sr_iface_op_disable_func_t disable_aec; + esp_afe_sr_iface_op_enable_func_t enable_aec; + esp_afe_sr_iface_op_disable_func_t disable_se; + esp_afe_sr_iface_op_enable_func_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_vad; + esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_disable_func_t disable_ns; + esp_afe_sr_iface_op_enable_func_t enable_ns; + esp_afe_sr_iface_op_disable_func_t disable_agc; + 
esp_afe_sr_iface_op_enable_func_t enable_agc; + esp_afe_sr_iface_op_print_pipeline_t print_pipeline; esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; + +// struct is used to store the AFE handle and data for the AFE task +typedef struct +{ + esp_afe_sr_data_t *afe_data; + esp_afe_sr_iface_t *afe_handle; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +}afe_task_into_t; + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32/esp_afe_sr_models.h b/include/esp32/esp_afe_sr_models.h index 39de63f..05a08d3 100644 --- a/include/esp32/esp_afe_sr_models.h +++ b/include/esp32/esp_afe_sr_models.h @@ -6,17 +6,7 @@ extern "C" { #include "esp_afe_sr_iface.h" - -#if CONFIG_AFE_INTERFACE_V1 -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#else -#error No valid afe selected. -#endif - +esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } diff --git a/include/esp32/esp_agc.h b/include/esp32/esp_agc.h index 76d3015..8ea1c05 100644 --- a/include/esp32/esp_agc.h +++ b/include/esp32/esp_agc.h @@ -26,8 +26,15 @@ typedef enum { ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size } ESP_AGE_ERR; +typedef enum { + AGC_MODE_SR = -1, // Bypass WEBRTC AGC + AGC_MODE_0 = 0, // Only saturation protection + AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)] +} agc_mode_t; -void *esp_agc_open(int agc_mode, int sample_rate); +void *esp_agc_open(agc_mode_t agc_mode, int sample_rate); void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs); int esp_agc_process(void 
*agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate); void esp_agc_close(void *agc_handle); diff --git a/include/esp32/esp_vad.h b/include/esp32/esp_vad.h index 90f8e20..f3c5dd4 100644 --- a/include/esp32/esp_vad.h +++ b/include/esp32/esp_vad.h @@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); typedef struct { vad_trigger_t *trigger; void *vad_inst; + int sample_rate; + int frame_size; }vad_handle_with_trigger_t; typedef vad_handle_with_trigger_t* vad_handle_t; @@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * @brief Creates an instance to the VAD structure. * * @param vad_mode Sets the VAD operating mode. - * @param min_speech_len Minimum frame number of speech duration - * @param min_noise_len Minimum frame number of noise duration + * @param sample_rate Sample rate in Hz + * @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30. + * @param min_speech_ms Minimum speech duration, unit is ms + * @param min_noise_ms Minimum noise duration, unit is ms * @return * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * - * @param inst The instance of VAD. - * - * @param data An array of 16-bit signed audio samples. - * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. * @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000. - * * @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30. 
- * * @return * - VAD_SILENCE if no voice * - VAD_SPEECH if voice is detected * */ -vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms); +vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms); + +/** + * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. + * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. + * @return + * - VAD_SILENCE if no voice + * - VAD_SPEECH if voice is detected + * + */ +vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); /** * @brief Free the VAD instance diff --git a/include/esp32/esp_vadn_iface.h b/include/esp32/esp_vadn_iface.h new file mode 100644 index 0000000..bc2860f --- /dev/null +++ b/include/esp32/esp_vadn_iface.h @@ -0,0 +1,164 @@ +#pragma once +#include "esp_vad.h" +#include "stdint.h" +#include "dl_lib_convq_queue.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque model data container +typedef struct model_iface_data_t model_iface_data_t; + +// /** +// * @brief The state of vad +// */ +// typedef enum { +// VAD_NOISE = -1, // Noise +// VADNET_STATE_SILENCE = 0, // Silence +// VAD_SPEECH = 1 // Speech +// } vad_state_t; + +/** + * @brief Easy function type to initialze a model instance with a detection mode + * and specified model name + * + * @param model_name The specified model name + * @param mode The voice activity detection mode + * @param channel_num The number of input audio channels + * @param min_speech_ms The minimum duration of speech in ms to trigger vad + * speech + * @param min_noise_ms The minimum duration of noise in ms to trigger vad + * noise + * @returns Handle to the model data + */ +typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)( + const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms); + +/** + * @brief Get the amount of samples that need to be passed to 
the detect + * function + * + * Every speech recognition model processes a certain number of samples at the + * same time. This function can be used to query that amount. Note that the + * returned amount is in 16-bit samples, not in bytes. + * + * @param model The model object to query + * @return The amount of samples to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model); + +/** + * @brief Get the channel number of samples that need to be passed to the detect + * function + * + * Every speech recognition model processes a certain number of samples at the + * same time. This function can be used to query that amount. Note that the + * returned amount is in 16-bit samples, not in bytes. + * + * @param model The model object to query + * @return The amount of samples to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model); + +/** + * @brief Get the sample rate of the samples to feed to the detect function + * + * @param model The model object to query + * @return The sample rate, in hz + */ +typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model); + +/** + * @brief Set the detection threshold to manually abjust the probability + * + * @param model The model object to query + * @param det_treshold The threshold to trigger wake words, the range of + * det_threshold is 0.5~0.9999 + * @return 0: setting failed, 1: setting success + */ +typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold); + +/** + * @brief Get the voice activity detection threshold + * + * @param model The model object to query + * @returns the detection threshold + */ +typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model); + +/** + * @brief Feed samples of an audio stream to the vad model and detect whether is + * voice. 
+ * + * @param model The model object to query + * @param samples An array of 16-bit signed audio samples. The array size used + * can be queried by the get_samp_chunksize function. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. + */ +typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); + +/** + * @brief Feed MFCC of an audio stream to the vad model and detect whether is + * voice. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. + */ +typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + +/** + * @brief Get the triggered channel index. Channel index starts from zero + * + * @param model The model object to query + * @return The channel index + */ +typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model); + +/** + * @brief Clean all states of model + * + * @param model The model object to query + */ +typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model); + +/** + * @brief Destroy a model object + * + * @param model Model object to destroy + */ +typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model); + +/** + * This structure contains the functions used to do operations on a voice + * activity detection model. 
+ */ +typedef struct { + esp_vadn_iface_op_create_t create; + esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize; + esp_vadn_iface_op_get_channel_num_t get_channel_num; + esp_vadn_iface_op_get_samp_rate_t get_samp_rate; + esp_vadn_iface_op_set_det_threshold_t set_det_threshold; + esp_vadn_iface_op_get_det_threshold_t get_det_threshold; + esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; + esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_detect_mfcc_t detect_mfcc; + esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data; + esp_vadn_iface_op_clean_t clean; + esp_vadn_iface_op_destroy_t destroy; +} esp_vadn_iface_t; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/esp32/esp_vadn_models.h b/include/esp32/esp_vadn_models.h new file mode 100644 index 0000000..eadc55f --- /dev/null +++ b/include/esp32/esp_vadn_models.h @@ -0,0 +1,22 @@ +#pragma once +#include "esp_vadn_iface.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The prefix of vadnet model name is used to filter all vadnet from available models. +#define ESP_VADN_PREFIX "vadnet" + +/** + * @brief Get the vadnet handle from model name + * + * @param model_name The name of model + * @returns The handle of vadnet + */ +const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name); + + +#ifdef __cplusplus +} +#endif diff --git a/include/esp32/esp_webrtc.h b/include/esp32/esp_webrtc.h new file mode 100644 index 0000000..0b85bdd --- /dev/null +++ b/include/esp32/esp_webrtc.h @@ -0,0 +1,90 @@ +// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License +#ifndef _ESP_WEBRTC_H_ +#define _ESP_WEBRTC_H_ + + +#ifdef __cplusplus +extern "C" { +#endif +#include +#include "sr_ringbuf.h" +#include "esp_log.h" +#include "esp_agc.h" +#include "esp_ns.h" + +#include "esp_heap_caps.h" + +typedef struct { + void* ns_handle; + void* agc_handle; + int frame_size; + int sample_rate; + int16_t *buff; + int16_t *out_data; + sr_ringbuf_handle_t rb; +}webrtc_handle_t; + +/** + * @brief Creates an instance of webrtc. + * + * @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms. + * + * @param frame_length_ms The length of the audio processing + * @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive + * @param agc_mode The model of AGC + * @param agc_gain The gain of AGC. default is 9 + * @param agc_target_level The target level of AGC. default is -3 dbfs + * @param sample_rate The sample rate of the audio. + * + * @return + * - NULL: Create failed + * - Others: The instance of webrtc + */ +webrtc_handle_t* webrtc_create( + int frame_length_ms, + int ns_mode, + agc_mode_t agc_mode, + int agc_gain, + int agc_target_level, + int sample_rate); + +/** + * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression. + * + * @param handle The instance of NS. + * @param in_data An array of 16-bit signed audio samples. 
+ * @param out_size The sample size of output data + * @param enable_ns Enable noise suppression + * @param enable_agc Enable automatic gain control + * + * @return data after noise suppression + */ +int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc); + +/** + * @brief Free the webrtc instance + * + * @param handle The instance of webrtc. + * + * @return None + * + */ +void webrtc_destroy(webrtc_handle_t *handle); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_NS_H_ diff --git a/include/esp32/esp_wn_iface.h b/include/esp32/esp_wn_iface.h index bbcdcb9..44bab8d 100644 --- a/include/esp32/esp_wn_iface.h +++ b/include/esp32/esp_wn_iface.h @@ -1,5 +1,6 @@ #pragma once #include "stdint.h" +#include "dl_lib_convq_queue.h" #ifdef __cplusplus extern "C" { @@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model); */ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model); +/** + * @brief Feed MFCC of an audio stream to the vad model and detect whether is + * voice. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. + */ +typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + /** * This structure contains the functions used to do operations on a wake word detection model. 
@@ -184,6 +204,8 @@ typedef struct { esp_wn_iface_op_get_triggered_channel_t get_triggered_channel; esp_wn_iface_op_get_vol_gain_t get_vol_gain; esp_wn_iface_op_detect_t detect; + esp_wn_iface_op_detect_mfcc_t detect_mfcc; + esp_wn_iface_op_get_mfcc_data_t get_mfcc_data; esp_wn_iface_op_clean_t clean; esp_wn_iface_op_destroy_t destroy; } esp_wn_iface_t; diff --git a/include/esp32p4/esp_aec.h b/include/esp32p4/esp_aec.h index deb031c..36de9c1 100644 --- a/include/esp32p4/esp_aec.h +++ b/include/esp32p4/esp_aec.h @@ -21,80 +21,72 @@ extern "C" { #endif #define USE_AEC_FFT // Not kiss_fft -#define AEC_USE_SPIRAM 0 #define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz -//#define AEC_FRAME_LENGTH_MS 16 #define AEC_FRAME_LENGTH_MS 32 -#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel -typedef void* aec_handle_t; +typedef struct aec_handle_t aec_handle_t; +typedef enum { + AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC for speech recognition + AEC_MODE_SR_HIGH_PERF = 1, // High Performance AEC for speech recognition + AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication + AEC_MODE_VOIP_HIGH_PERF = 4, // High Performance AEC for voice communication +} aec_mode_t; /** * @brief Creates an instance to the AEC structure. + * Please get frame size by aec_get_chunksize() function * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms. - * - * @param filter_length Number of samples of echo to cancel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption. 
+ * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length); +aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode); /** - * @brief Creates an instance to the AEC structure. + * @brief Creates an instance to the AEC structure, same with aec_create(). * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * - * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms. - * - * @param filter_length Number of samples of echo to cancel. - * - * @param nch Number of input signal channel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption. + * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch); - -/** - * @brief Creates an instance of more powerful AEC. - * - * @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly. - * - * @param nch Number of microphones. - * - * @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3. 
- * - * @return - * - NULL: Create failed - * - Others: An Instance of AEC - */ -aec_handle_t aec_pro_create(int frame_length, int nch, int mode); +aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode); /** * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic. * - * @param inst The instance of AEC. - * + * @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc(). + * + * @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..." * @param indata An array of 16-bit signed audio samples from mic. - * * @param refdata An array of 16-bit signed audio samples sent to the speaker. - * - * @param outdata Returns near-end signal with echo removed. - * + * @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..." * @return None * */ -void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata); +void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int aec_get_chunksize(const aec_handle_t *handle); + +/** + * @brief Get AEC mode string + * + * @param aec_mode The mode of AEC. 
+ * + * @return AEC mode string + */ +char * aec_get_mode_string(aec_mode_t aec_mode); /** * @brief Free the AEC instance @@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int * @return None * */ -void aec_destroy(aec_handle_t inst); +void aec_destroy(aec_handle_t *handel); #ifdef __cplusplus } diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index c32689d..694caa2 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -1,24 +1,41 @@ #pragma once #include "stdint.h" +#include "stdbool.h" +#include "stdlib.h" #include "esp_wn_iface.h" #include "esp_wn_models.h" #include "esp_vad.h" - +#include "esp_aec.h" +#include "esp_agc.h" +#include "model_path.h" +#include "esp_vadn_models.h" +#include "esp_nsn_models.h" #ifdef __cplusplus extern "C" { #endif //AFE: Audio Front-End //SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition - +//VC: Voice Communication //Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, - SR_MODE_HIGH_PERF = 1 + SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; +//Set AFE mode +typedef enum { + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode +} afe_mode_t; + +//Set AFE type +typedef enum { + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression +} afe_type_t; + typedef enum { AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance @@ -26,24 +43,30 @@ typedef enum { } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to 
multinet is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num - int mic_num; // mic channel num - int ref_num; // reference channel num - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t* mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t* ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - NS_MODE_SSP = 0, // speech signal process method - NS_MODE_NET = 1, // deep noise suppression net method + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; +typedef enum { + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated +} afe_agc_mode_t; /** * @brief Function to get the debug audio data @@ -66,148 +89,192 @@ typedef struct { } afe_debug_hook_t; typedef struct { - bool aec_init; - bool se_init; - bool vad_init; + /********** AEC(Acoustic Echo Cancellation) **********/ + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec + + /********** SE(Speech Enhancement, microphone array 
processing) **********/ + bool se_init; // Whether to init se + + /********** NS(Noise Suppression) **********/ + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + + /********** VAD(Voice Activity Detection) **********/ + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + + /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - bool voice_communication_init; - bool voice_communication_agc_init; // AGC swich for voice communication - int voice_communication_agc_gain; // AGC gain(dB) for voice communication - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 char *wakenet_model_name; // The model name of wakenet 1 char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; - afe_sr_mode_t afe_mode; - int afe_perferred_core; - int afe_perferred_priority; - int afe_ringbuf_size; - afe_memory_alloc_mode_t memory_alloc_mode; - float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0]. - // This value acts directly on the output amplitude: out_linear_gain * amplitude. - afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
+ det_mode_t wakenet_mode; // The mode of wakenet + + /********** AGC(Automatic Gain Control) **********/ + bool agc_init; // Whether to init agc + afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + + /********** General AFE(Audio Front End) parameter **********/ afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. + afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; - afe_ns_mode_t afe_ns_mode; - char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet - char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms - int vad_min_noise_ms; // The minimum duration of noise/silence in ms. 
It should be bigger than 64 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; +/** + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. + * You can manually fine-tune it after creating the configuration + * + * The input format: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param models Models from partition, which is configured by Kconfig + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_config_t* The default config of afe + */ +afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); -#if CONFIG_IDF_TARGET_ESP32 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_HIGH_PERF, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = 
{{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32P4 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32S3 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_2CH_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - 
.afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 3, \ - .mic_num = 2, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#endif +/** + * @brief Check AFE configuration and make sure it is correct. + * + * @warning If there is a configuration conflict, this function will modify some parameters. + * The guiding behind these modifications is to maintain the highest performance of the output audio and results. + * And remove the conflict between different algorithms. + * + * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. + * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
+ * + * @param afe_config Input AFE config + * + * @return afe_config_t* The modified AFE config + */ +afe_config_t *afe_config_check(afe_config_t *afe_config); + +/** + * @brief Parse input format + * + * @param input_format The input format, same with afe_config_init() function + * @param pcm_config The pcm config + * + * @return true if the input format is parsed successfully, otherwise false + */ +bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse I2S input data + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param mic_data The output microphone data + * @param ref_data The output playback reference data + * @param pcm_config The pcm config + * + */ +void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse input data, from interleaved arrangement to contiguous arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Format input data, from contiguous arrangement to interleaved arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. 
+ * + * @param data The input audio data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param factor The gain factor + * + * @return int16_t* The output audio data + */ +int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. + * + * @param in_data The input audio data + * @param in_frame_size Input data frame size of input + * @param channel_num The channel number of input data, which is same as output data + * @param out_data The output audio data + * @param out_frame_size Onput data frame size of input + * + */ +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); + +/** + * @brief Copy the afe config + * + * @param dst_config The destination afe config + * @param src_config The source afe config + * + * @return The destination afe config + */ +afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); + +/** + * @brief Print the afe config + * + * @param afe_config The afe config + */ +void afe_config_print(const afe_config_t *afe_config); + +/** + * @brief Allocate afe config + * + * @return The afe config pointer + */ +afe_config_t *afe_config_alloc(); + +/** + * @brief Free afe config + * + * @param afe_config The afe config pointer + */ +void afe_config_free(afe_config_t *afe_config); #ifdef __cplusplus } diff --git a/include/esp32p4/esp_afe_sr_iface.h b/include/esp32p4/esp_afe_sr_iface.h index 84d7000..f434c3e 100644 --- a/include/esp32p4/esp_afe_sr_iface.h +++ b/include/esp32p4/esp_afe_sr_iface.h @@ -1,7 +1,10 @@ #pragma once #include "stdint.h" +#include "stdlib.h" +#include "stdbool.h" #include "esp_afe_config.h" - +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif @@ -13,13 +16,15 @@ extern "C" { //Opaque AFE_SR data container typedef struct 
esp_afe_sr_data_t esp_afe_sr_data_t; + + /** * @brief The state of vad */ typedef enum { - AFE_VAD_SILENCE = 0, // noise or silence - AFE_VAD_SPEECH // speech + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** @@ -27,7 +32,7 @@ typedef enum */ typedef struct afe_fetch_result_t { - int16_t *data; // the data of audio. + int16_t *data; // the target channel data of audio. int data_size; // the size of data. The unit is byte. int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. int vad_cache_size; // the size of vad_cache. The unit is byte. @@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t wakenet_state_t wakeup_state; // the value is wakenet_state_t int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - afe_vad_state_t vad_state; // the value is afe_vad_state_t + vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data void* reserved; // reserved for future use } afe_fetch_result_t; @@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** - * @brief Get the total channel number which be config + * @brief Get the channel number * * @param afe The AFE_SR object to query * @return The amount of total channels */ -typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Get the mic channel number which be config - * - * @param afe The AFE_SR object to query - * @return The amount of mic channels - */ typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe); /** @@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * @brief fetch enhanced samples of an audio stream from the AFE_SR * * @Warning The output is single channel data, no matter how many channels the input is. + * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) */ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +/** + * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` + * + * @Warning The output is single channel data, no matter how many channels the input is. + * + * @param afe The AFE_SR object to query + * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ */ +typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); + /** * @brief reset ringbuf of AFE. * @@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); /** - * @brief Disable wakenet model. + * @brief Enable VAD algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable wakenet model. + * @brief Disable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Disable AEC algorithm. + * @brief Enable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable AEC algorithm. + * @brief Print all functions/modules/algorithms pipeline. + * The pipeline is the order of the functions/modules/algorithms. + * The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output] * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Disable SE algorithm. - * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Enable SE algorithm. 
- * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe); +typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); /** * @brief Destroy a AFE_SR instance @@ -191,22 +187,41 @@ typedef struct { esp_afe_sr_iface_op_create_from_config_t create_from_config; esp_afe_sr_iface_op_feed_t feed; esp_afe_sr_iface_op_fetch_t fetch; + esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay; esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; esp_afe_sr_iface_op_set_wakenet_t set_wakenet; - esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet; - esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet; - esp_afe_sr_iface_op_disable_aec_t disable_aec; - esp_afe_sr_iface_op_enable_aec_t enable_aec; - esp_afe_sr_iface_op_disable_se_t disable_se; - esp_afe_sr_iface_op_enable_se_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_wakenet; + esp_afe_sr_iface_op_enable_func_t enable_wakenet; + esp_afe_sr_iface_op_disable_func_t disable_aec; + esp_afe_sr_iface_op_enable_func_t enable_aec; + esp_afe_sr_iface_op_disable_func_t disable_se; + esp_afe_sr_iface_op_enable_func_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_vad; + esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_disable_func_t disable_ns; + esp_afe_sr_iface_op_enable_func_t enable_ns; + esp_afe_sr_iface_op_disable_func_t disable_agc; + 
esp_afe_sr_iface_op_enable_func_t enable_agc; + esp_afe_sr_iface_op_print_pipeline_t print_pipeline; esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; + +// struct is used to store the AFE handle and data for the AFE task +typedef struct +{ + esp_afe_sr_data_t *afe_data; + esp_afe_sr_iface_t *afe_handle; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +}afe_task_into_t; + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32p4/esp_afe_sr_models.h b/include/esp32p4/esp_afe_sr_models.h index 39de63f..05a08d3 100644 --- a/include/esp32p4/esp_afe_sr_models.h +++ b/include/esp32p4/esp_afe_sr_models.h @@ -6,17 +6,7 @@ extern "C" { #include "esp_afe_sr_iface.h" - -#if CONFIG_AFE_INTERFACE_V1 -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#else -#error No valid afe selected. -#endif - +esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } diff --git a/include/esp32p4/esp_agc.h b/include/esp32p4/esp_agc.h index 76d3015..8ea1c05 100644 --- a/include/esp32p4/esp_agc.h +++ b/include/esp32p4/esp_agc.h @@ -26,8 +26,15 @@ typedef enum { ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size } ESP_AGE_ERR; +typedef enum { + AGC_MODE_SR = -1, // Bypass WEBRTC AGC + AGC_MODE_0 = 0, // Only saturation protection + AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)] +} agc_mode_t; -void *esp_agc_open(int agc_mode, int sample_rate); +void *esp_agc_open(agc_mode_t agc_mode, int sample_rate); void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs); int 
esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate); void esp_agc_close(void *agc_handle); diff --git a/include/esp32p4/esp_vad.h b/include/esp32p4/esp_vad.h index 90f8e20..f3c5dd4 100644 --- a/include/esp32p4/esp_vad.h +++ b/include/esp32p4/esp_vad.h @@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); typedef struct { vad_trigger_t *trigger; void *vad_inst; + int sample_rate; + int frame_size; }vad_handle_with_trigger_t; typedef vad_handle_with_trigger_t* vad_handle_t; @@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * @brief Creates an instance to the VAD structure. * * @param vad_mode Sets the VAD operating mode. - * @param min_speech_len Minimum frame number of speech duration - * @param min_noise_len Minimum frame number of noise duration + * @param sample_rate Sample rate in Hz + * @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30. + * @param min_speech_ms Minimum speech duration, unit is ms + * @param min_noise_ms Minimum noise duration, unit is ms * @return * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * - * @param inst The instance of VAD. - * - * @param data An array of 16-bit signed audio samples. - * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. * @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000. - * * @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30. 
- * * @return * - VAD_SILENCE if no voice * - VAD_SPEECH if voice is detected * */ -vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms); +vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms); + +/** + * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. + * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. + * @return + * - VAD_SILENCE if no voice + * - VAD_SPEECH if voice is detected + * + */ +vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); /** * @brief Free the VAD instance diff --git a/include/esp32p4/esp_vadn_iface.h b/include/esp32p4/esp_vadn_iface.h index 1ec8bb9..bc2860f 100644 --- a/include/esp32p4/esp_vadn_iface.h +++ b/include/esp32p4/esp_vadn_iface.h @@ -1,6 +1,7 @@ #pragma once #include "esp_vad.h" #include "stdint.h" +#include "dl_lib_convq_queue.h" #ifdef __cplusplus extern "C" { @@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model */ typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); +/** + * @brief Feed MFCC of an audio stream to the vad model and detect whether is + * voice. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. + */ +typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + /** * @brief Get the triggered channel index. 
Channel index starts from zero * @@ -133,6 +153,8 @@ typedef struct { esp_vadn_iface_op_get_det_threshold_t get_det_threshold; esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_detect_mfcc_t detect_mfcc; + esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data; esp_vadn_iface_op_clean_t clean; esp_vadn_iface_op_destroy_t destroy; } esp_vadn_iface_t; diff --git a/include/esp32p4/esp_webrtc.h b/include/esp32p4/esp_webrtc.h new file mode 100644 index 0000000..0b85bdd --- /dev/null +++ b/include/esp32p4/esp_webrtc.h @@ -0,0 +1,90 @@ +// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License +#ifndef _ESP_WEBRTC_H_ +#define _ESP_WEBRTC_H_ + + +#ifdef __cplusplus +extern "C" { +#endif +#include +#include "sr_ringbuf.h" +#include "esp_log.h" +#include "esp_agc.h" +#include "esp_ns.h" + +#include "esp_heap_caps.h" + +typedef struct { + void* ns_handle; + void* agc_handle; + int frame_size; + int sample_rate; + int16_t *buff; + int16_t *out_data; + sr_ringbuf_handle_t rb; +}webrtc_handle_t; + +/** + * @brief Creates an instance of webrtc. + * + * @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms. + * + * @param frame_length_ms The length of the audio processing + * @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive + * @param agc_mode The model of AGC + * @param agc_gain The gain of AGC. 
default is 9 + * @param agc_target_level The target level of AGC. default is -3 dbfs + * @param sample_rate The sample rate of the audio. + * + * @return + * - NULL: Create failed + * - Others: The instance of webrtc + */ +webrtc_handle_t* webrtc_create( + int frame_length_ms, + int ns_mode, + agc_mode_t agc_mode, + int agc_gain, + int agc_target_level, + int sample_rate); + +/** + * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression. + * + * @param handle The instance of webrtc. + * @param indata An array of 16-bit signed audio samples. + * @param size The sample size of output data + * @param enable_ns Enable noise suppression + * @param enable_agc Enable automatic gain control + * + * @return data after noise suppression + */ +int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc); + +/** + * @brief Free the webrtc instance + * + * @param handle The instance of webrtc. + * + * @return None + * + */ +void webrtc_destroy(webrtc_handle_t *handle); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_WEBRTC_H_ diff --git a/include/esp32p4/esp_wn_iface.h b/include/esp32p4/esp_wn_iface.h index bbcdcb9..44bab8d 100644 --- a/include/esp32p4/esp_wn_iface.h +++ b/include/esp32p4/esp_wn_iface.h @@ -1,5 +1,6 @@ #pragma once #include "stdint.h" +#include "dl_lib_convq_queue.h" #ifdef __cplusplus extern "C" { @@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model); */ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model); +/** + * @brief Feed MFCC of an audio stream to the wake word model and detect whether a + * wake word is present. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words.
+ */ +typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + /** * This structure contains the functions used to do operations on a wake word detection model. @@ -184,6 +204,8 @@ typedef struct { esp_wn_iface_op_get_triggered_channel_t get_triggered_channel; esp_wn_iface_op_get_vol_gain_t get_vol_gain; esp_wn_iface_op_detect_t detect; + esp_wn_iface_op_detect_mfcc_t detect_mfcc; + esp_wn_iface_op_get_mfcc_data_t get_mfcc_data; esp_wn_iface_op_clean_t clean; esp_wn_iface_op_destroy_t destroy; } esp_wn_iface_t; diff --git a/include/esp32s3/esp_aec.h b/include/esp32s3/esp_aec.h index deb031c..36de9c1 100644 --- a/include/esp32s3/esp_aec.h +++ b/include/esp32s3/esp_aec.h @@ -21,80 +21,72 @@ extern "C" { #endif #define USE_AEC_FFT // Not kiss_fft -#define AEC_USE_SPIRAM 0 #define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz -//#define AEC_FRAME_LENGTH_MS 16 #define AEC_FRAME_LENGTH_MS 32 -#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel -typedef void* aec_handle_t; +typedef struct aec_handle_t aec_handle_t; +typedef enum { + AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC for speech recognition + AEC_MODE_SR_HIGH_PERF = 1, // High Performance AEC for speech recognition + AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication + AEC_MODE_VOIP_HIGH_PERF = 4, // High Performance AEC for voice communication +} aec_mode_t; /** * @brief Creates an instance to the AEC structure. + * Please get frame size by aec_get_chunksize() function * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms.
- * - * @param filter_length Number of samples of echo to cancel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption. + * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length); +aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode); /** - * @brief Creates an instance to the AEC structure. + * @brief Creates an instance to the AEC structure, same with aec_create(). * - * @deprecated This API will be deprecated after version 1.0, please use aec_pro_create - * - * @param sample_rate The Sampling frequency (Hz) must be 16000. - * - * @param frame_length The length of the audio processing must be 16ms. - * - * @param filter_length Number of samples of echo to cancel. - * - * @param nch Number of input signal channel. - * + * @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption. + * @param channel_num The input microphone channel number + * @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST * @return * - NULL: Create failed * - Others: The instance of AEC */ -aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch); - -/** - * @brief Creates an instance of more powerful AEC. - * - * @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly. - * - * @param nch Number of microphones. - * - * @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3. 
- * - * @return - * - NULL: Create failed - * - Others: An Instance of AEC - */ -aec_handle_t aec_pro_create(int frame_length, int nch, int mode); +aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode); /** * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic. * - * @param inst The instance of AEC. - * + * @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc(). + * + * @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..." * @param indata An array of 16-bit signed audio samples from mic. - * * @param refdata An array of 16-bit signed audio samples sent to the speaker. - * - * @param outdata Returns near-end signal with echo removed. - * + * @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..." * @return None * */ -void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata); +void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int aec_get_chunksize(const aec_handle_t *handle); + +/** + * @brief Get AEC mode string + * + * @param aec_mode The mode of AEC. 
+ * + * @return AEC mode string + */ +char * aec_get_mode_string(aec_mode_t aec_mode); /** * @brief Free the AEC instance @@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int * @return None * */ -void aec_destroy(aec_handle_t inst); +void aec_destroy(aec_handle_t *handel); #ifdef __cplusplus } diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 5f70735..694caa2 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -1,24 +1,41 @@ #pragma once #include "stdint.h" +#include "stdbool.h" +#include "stdlib.h" #include "esp_wn_iface.h" #include "esp_wn_models.h" #include "esp_vad.h" - +#include "esp_aec.h" +#include "esp_agc.h" +#include "model_path.h" +#include "esp_vadn_models.h" +#include "esp_nsn_models.h" #ifdef __cplusplus extern "C" { #endif //AFE: Audio Front-End //SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition - +//VC: Voice Communication //Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, - SR_MODE_HIGH_PERF = 1 + SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; +//Set AFE mode +typedef enum { + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode +} afe_mode_t; + +//Set AFE type +typedef enum { + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression +} afe_type_t; + typedef enum { AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance @@ -26,24 +43,30 @@ typedef enum { } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to 
multinet is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num - int mic_num; // mic channel num - int ref_num; // reference channel num - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t* mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t* ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - NS_MODE_SSP = 0, // speech signal process method - NS_MODE_NET = 1, // deep noise suppression net method + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; +typedef enum { + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated +} afe_agc_mode_t; /** * @brief Function to get the debug audio data @@ -66,148 +89,192 @@ typedef struct { } afe_debug_hook_t; typedef struct { - bool aec_init; - bool se_init; - bool vad_init; + /********** AEC(Acoustic Echo Cancellation) **********/ + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec + + /********** SE(Speech Enhancement, microphone array
processing) **********/ + bool se_init; // Whether to init se + + /********** NS(Noise Suppression) **********/ + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + + /********** VAD(Voice Activity Detection) **********/ + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + + /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - bool voice_communication_init; - bool voice_communication_agc_init; // AGC swich for voice communication - int voice_communication_agc_gain; // AGC gain(dB) for voice communication - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 char *wakenet_model_name; // The model name of wakenet 1 char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; - afe_sr_mode_t afe_mode; - int afe_perferred_core; - int afe_perferred_priority; - int afe_ringbuf_size; - afe_memory_alloc_mode_t memory_alloc_mode; - float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0]. - // This value acts directly on the output amplitude: out_linear_gain * amplitude. - afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
+ det_mode_t wakenet_mode; // The mode of wakenet + + /********** AGC(Automatic Gain Control) **********/ + bool agc_init; // Whether to init agc + afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + + /********** General AFE(Audio Front End) parameter **********/ afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. + afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output, the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; - afe_ns_mode_t afe_ns_mode; - char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet - char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms - int vad_min_noise_ms; // The minimum duration of noise/silence in ms.
It should be bigger than 64 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; +/** + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. + * You can manually fine-tune it after creating the configuration + * + * The input format: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param models Models from partition, which is configured by Kconfig + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_config_t* The default config of afe + */ +afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); -#if CONFIG_IDF_TARGET_ESP32 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_0, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_HIGH_PERF, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = 
{{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32P4 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_0, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - .afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 2, \ - .mic_num = 1, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#elif CONFIG_IDF_TARGET_ESP32S3 -#define AFE_CONFIG_DEFAULT() { \ - .aec_init = true, \ - .se_init = true, \ - .vad_init = true, \ - .wakenet_init = true, \ - .voice_communication_init = false, \ - .voice_communication_agc_init = false, \ - .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_0, \ - .wakenet_model_name = NULL, \ - .wakenet_model_name_2 = NULL, \ - .wakenet_mode = DET_MODE_2CH_90, \ - .afe_mode = SR_MODE_LOW_COST, \ - .afe_perferred_core = 0, \ - .afe_perferred_priority = 5, \ - .afe_ringbuf_size = 50, \ - .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ - 
.afe_linear_gain = 1.0, \ - .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ - .pcm_config = { \ - .total_ch_num = 3, \ - .mic_num = 2, \ - .ref_num = 1, \ - .sample_rate = 16000, \ - }, \ - .debug_init = false, \ - .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ - .afe_ns_mode = NS_MODE_SSP, \ - .afe_ns_model_name = NULL, \ - .fixed_first_channel = true, \ - .vad_model_name = NULL, \ - .vad_min_speech_ms = 64, \ - .vad_min_noise_ms = 256, \ - .vad_mute_playback = false, \ -} -#endif +/** + * @brief Check AFE configuration and make sure it is correct. + * + * @warning If there is a configuration conflict, this function will modify some parameters. + * The guiding behind these modifications is to maintain the highest performance of the output audio and results. + * And remove the conflict between different algorithms. + * + * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. + * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
+ * + * @param afe_config Input AFE config + * + * @return afe_config_t* The modified AFE config + */ +afe_config_t *afe_config_check(afe_config_t *afe_config); + +/** + * @brief Parse input format + * + * @param input_format The input format, same with afe_config_init() function + * @param pcm_config The pcm config + * + * @return true if the input format is parsed successfully, otherwise false + */ +bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse I2S input data + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param mic_data The output microphone data + * @param ref_data The output playback reference data + * @param pcm_config The pcm config + * + */ +void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); + +/** + * @brief Parse input data, from interleaved arrangement to contiguous arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Format input data, from contiguous arrangement to interleaved arrangement + * + * @param data The input multi channel data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param channel_num The channel number of data + * @param out_data The output data + * + */ +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. 
+ * + * @param data The input audio data + * @param frame_size The frame size of input, it is also the size of single channel data + * @param factor The gain factor + * + * @return int16_t* The output audio data + */ +int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); + +/** + * @brief Adjust the gain of input data + * + * @warning the input data will be modified inplace. + * + * @param in_data The input audio data + * @param in_frame_size Input data frame size of input + * @param channel_num The channel number of input data, which is same as output data + * @param out_data The output audio data + * @param out_frame_size Onput data frame size of input + * + */ +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); + +/** + * @brief Copy the afe config + * + * @param dst_config The destination afe config + * @param src_config The source afe config + * + * @return The destination afe config + */ +afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); + +/** + * @brief Print the afe config + * + * @param afe_config The afe config + */ +void afe_config_print(const afe_config_t *afe_config); + +/** + * @brief Allocate afe config + * + * @return The afe config pointer + */ +afe_config_t *afe_config_alloc(); + +/** + * @brief Free afe config + * + * @param afe_config The afe config pointer + */ +void afe_config_free(afe_config_t *afe_config); #ifdef __cplusplus } diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index 84d7000..f434c3e 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h @@ -1,7 +1,10 @@ #pragma once #include "stdint.h" +#include "stdlib.h" +#include "stdbool.h" #include "esp_afe_config.h" - +#include "freertos/FreeRTOS.h" +#include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif @@ -13,13 +16,15 @@ extern "C" { //Opaque AFE_SR data container typedef struct 
esp_afe_sr_data_t esp_afe_sr_data_t; + + /** * @brief The state of vad */ typedef enum { - AFE_VAD_SILENCE = 0, // noise or silence - AFE_VAD_SPEECH // speech + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** @@ -27,7 +32,7 @@ typedef enum */ typedef struct afe_fetch_result_t { - int16_t *data; // the data of audio. + int16_t *data; // the target channel data of audio. int data_size; // the size of data. The unit is byte. int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. int vad_cache_size; // the size of vad_cache. The unit is byte. @@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t wakenet_state_t wakeup_state; // the value is wakenet_state_t int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - afe_vad_state_t vad_state; // the value is afe_vad_state_t + vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data void* reserved; // reserved for future use } afe_fetch_result_t; @@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** - * @brief Get the total channel number which be config + * @brief Get the channel number * * @param afe The AFE_SR object to query * @return The amount of total channels */ -typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Get the mic channel number which be config - * - * @param afe The AFE_SR object to query - * @return The amount of mic channels - */ typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe); /** @@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * @brief fetch enhanced samples of an audio stream from the AFE_SR * * @Warning The output is single channel data, no matter how many channels the input is. + * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) */ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +/** + * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` + * + * @Warning The output is single channel data, no matter how many channels the input is. + * + * @param afe The AFE_SR object to query + * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ */ +typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); + /** * @brief reset ringbuf of AFE. * @@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); /** - * @brief Disable wakenet model. + * @brief Enable VAD algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable wakenet model. + * @brief Disable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Disable AEC algorithm. + * @brief Enable one function/module/algorithm. * * @param afe The AFE_SR object to query * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe); /** - * @brief Enable AEC algorithm. + * @brief Print all functions/modules/algorithms pipeline. + * The pipeline is the order of the functions/modules/algorithms. + * The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output] * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled */ -typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Disable SE algorithm. - * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe); - -/** - * @brief Enable SE algorithm. 
- * - * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled - */ -typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe); +typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); /** * @brief Destroy a AFE_SR instance @@ -191,22 +187,41 @@ typedef struct { esp_afe_sr_iface_op_create_from_config_t create_from_config; esp_afe_sr_iface_op_feed_t feed; esp_afe_sr_iface_op_fetch_t fetch; + esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay; esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; + esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; esp_afe_sr_iface_op_set_wakenet_t set_wakenet; - esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet; - esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet; - esp_afe_sr_iface_op_disable_aec_t disable_aec; - esp_afe_sr_iface_op_enable_aec_t enable_aec; - esp_afe_sr_iface_op_disable_se_t disable_se; - esp_afe_sr_iface_op_enable_se_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_wakenet; + esp_afe_sr_iface_op_enable_func_t enable_wakenet; + esp_afe_sr_iface_op_disable_func_t disable_aec; + esp_afe_sr_iface_op_enable_func_t enable_aec; + esp_afe_sr_iface_op_disable_func_t disable_se; + esp_afe_sr_iface_op_enable_func_t enable_se; + esp_afe_sr_iface_op_disable_func_t disable_vad; + esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_disable_func_t disable_ns; + esp_afe_sr_iface_op_enable_func_t enable_ns; + esp_afe_sr_iface_op_disable_func_t disable_agc; + 
esp_afe_sr_iface_op_enable_func_t enable_agc; + esp_afe_sr_iface_op_print_pipeline_t print_pipeline; esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; + +// struct is used to store the AFE handle and data for the AFE task +typedef struct +{ + esp_afe_sr_data_t *afe_data; + esp_afe_sr_iface_t *afe_handle; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +}afe_task_into_t; + #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32s3/esp_afe_sr_models.h b/include/esp32s3/esp_afe_sr_models.h index 39de63f..05a08d3 100644 --- a/include/esp32s3/esp_afe_sr_models.h +++ b/include/esp32s3/esp_afe_sr_models.h @@ -6,17 +6,7 @@ extern "C" { #include "esp_afe_sr_iface.h" - -#if CONFIG_AFE_INTERFACE_V1 -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#else -#error No valid afe selected. -#endif - +esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } diff --git a/include/esp32s3/esp_agc.h b/include/esp32s3/esp_agc.h index 76d3015..8ea1c05 100644 --- a/include/esp32s3/esp_agc.h +++ b/include/esp32s3/esp_agc.h @@ -26,8 +26,15 @@ typedef enum { ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size } ESP_AGE_ERR; +typedef enum { + AGC_MODE_SR = -1, // Bypass WEBRTC AGC + AGC_MODE_0 = 0, // Only saturation protection + AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)] + AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)] +} agc_mode_t; -void *esp_agc_open(int agc_mode, int sample_rate); +void *esp_agc_open(agc_mode_t agc_mode, int sample_rate); void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs); int 
esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate); void esp_agc_close(void *agc_handle); diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index 90f8e20..f3c5dd4 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); typedef struct { vad_trigger_t *trigger; void *vad_inst; + int sample_rate; + int frame_size; }vad_handle_with_trigger_t; typedef vad_handle_with_trigger_t* vad_handle_t; @@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * @brief Creates an instance to the VAD structure. * * @param vad_mode Sets the VAD operating mode. - * @param min_speech_len Minimum frame number of speech duration - * @param min_noise_len Minimum frame number of noise duration + * @param sample_rate Sample rate in Hz + * @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30. + * @param min_speech_ms Minimum speech duration, unit is ms + * @param min_noise_ms Minimum noise duration, unit is ms * @return * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * - * @param inst The instance of VAD. - * - * @param data An array of 16-bit signed audio samples. - * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. * @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000. - * * @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30. 
- * * @return * - VAD_SILENCE if no voice * - VAD_SPEECH if voice is detected * */ -vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms); +vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms); + +/** + * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. + * + * @param handle The instance of VAD. + * @param data An array of 16-bit signed audio samples. + * @return + * - VAD_SILENCE if no voice + * - VAD_SPEECH if voice is detected + * + */ +vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); /** * @brief Free the VAD instance diff --git a/include/esp32s3/esp_vadn_iface.h b/include/esp32s3/esp_vadn_iface.h index 1ec8bb9..bc2860f 100644 --- a/include/esp32s3/esp_vadn_iface.h +++ b/include/esp32s3/esp_vadn_iface.h @@ -1,6 +1,7 @@ #pragma once #include "esp_vad.h" #include "stdint.h" +#include "dl_lib_convq_queue.h" #ifdef __cplusplus extern "C" { @@ -98,6 +99,25 @@ typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model */ typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); +/** + * @brief Feed MFCC of an audio stream to the vad model and detect whether is + * voice. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. + */ +typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + /** * @brief Get the triggered channel index. 
Channel index starts from zero * @@ -133,6 +153,8 @@ typedef struct { esp_vadn_iface_op_get_det_threshold_t get_det_threshold; esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_detect_mfcc_t detect_mfcc; + esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data; esp_vadn_iface_op_clean_t clean; esp_vadn_iface_op_destroy_t destroy; } esp_vadn_iface_t; diff --git a/include/esp32s3/esp_webrtc.h b/include/esp32s3/esp_webrtc.h new file mode 100644 index 0000000..0b85bdd --- /dev/null +++ b/include/esp32s3/esp_webrtc.h @@ -0,0 +1,90 @@ +// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at + +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License +#ifndef _ESP_WEBRTC_H_ +#define _ESP_WEBRTC_H_ + + +#ifdef __cplusplus +extern "C" { +#endif +#include +#include "sr_ringbuf.h" +#include "esp_log.h" +#include "esp_agc.h" +#include "esp_ns.h" + +#include "esp_heap_caps.h" + +typedef struct { + void* ns_handle; + void* agc_handle; + int frame_size; + int sample_rate; + int16_t *buff; + int16_t *out_data; + sr_ringbuf_handle_t rb; +}webrtc_handle_t; + +/** + * @brief Creates an instance of webrtc. + * + * @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms. + * + * @param frame_length_ms The length of the audio processing + * @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive + * @param agc_mode The model of AGC + * @param agc_gain The gain of AGC. 
default is 9 + * @param agc_target_level The target level of AGC. default is -3 dbfs + * @param sample_rate The sample rate of the audio. + * + * @return + * - NULL: Create failed + * - Others: The instance of webrtc + */ +webrtc_handle_t* webrtc_create( + int frame_length_ms, + int ns_mode, + agc_mode_t agc_mode, + int agc_gain, + int agc_target_level, + int sample_rate); + +/** + * @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression. + * + * @param handle The instance of NS. + * @param in_data An array of 16-bit signed audio samples. + * @param out_size The sample size of output data + * @param enable_ns Enable noise suppression + * @param enable_agc Enable automatic gain control + * + * @return data after noise suppression + */ +int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc); + +/** + * @brief Free the webrtc instance + * + * @param handle The instance of webrtc. + * + * @return None + * + */ +void webrtc_destroy(webrtc_handle_t *handle); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_NS_H_ diff --git a/include/esp32s3/esp_wn_iface.h b/include/esp32s3/esp_wn_iface.h index bbcdcb9..44bab8d 100644 --- a/include/esp32s3/esp_wn_iface.h +++ b/include/esp32s3/esp_wn_iface.h @@ -1,5 +1,6 @@ #pragma once #include "stdint.h" +#include "dl_lib_convq_queue.h" #ifdef __cplusplus extern "C" { @@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model); */ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model); +/** + * @brief Feed MFCC of an audio stream to the vad model and detect whether is + * voice. + * + * @param model The model object to query + * @param cq An array of 16-bit MFCC. + * @return The index of wake words, return 0 if no wake word is detected, else + * the index of the wake words. 
+ */ +typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq); + +/** + * @brief Get MFCC of an audio stream + * + * @param model The model object to query + * @return MFCC data + */ +typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model); + /** * This structure contains the functions used to do operations on a wake word detection model. @@ -184,6 +204,8 @@ typedef struct { esp_wn_iface_op_get_triggered_channel_t get_triggered_channel; esp_wn_iface_op_get_vol_gain_t get_vol_gain; esp_wn_iface_op_detect_t detect; + esp_wn_iface_op_detect_mfcc_t detect_mfcc; + esp_wn_iface_op_get_mfcc_data_t get_mfcc_data; esp_wn_iface_op_clean_t clean; esp_wn_iface_op_destroy_t destroy; } esp_wn_iface_t; diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index a105141..840eeae 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libdl_lib.a b/lib/esp32/libdl_lib.a index 97717e0..2e143f3 100644 Binary files a/lib/esp32/libdl_lib.a and b/lib/esp32/libdl_lib.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index 41d45b4..13e39be 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index 8cdf8cf..8f3cbf1 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libflite_g2p.a b/lib/esp32/libflite_g2p.a index e0faf3c..7d058a6 100644 Binary files a/lib/esp32/libflite_g2p.a and b/lib/esp32/libflite_g2p.a differ diff --git a/lib/esp32/libfst.a b/lib/esp32/libfst.a new file mode 100644 index 0000000..dc05523 Binary files /dev/null and b/lib/esp32/libfst.a differ diff --git a/lib/esp32/libhufzip.a b/lib/esp32/libhufzip.a new file mode 100644 index 
0000000..12e82e4 Binary files /dev/null and b/lib/esp32/libhufzip.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index 024f5c9..a246897 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libnsnet.a b/lib/esp32/libnsnet.a new file mode 100644 index 0000000..ac93b40 Binary files /dev/null and b/lib/esp32/libnsnet.a differ diff --git a/lib/esp32/libvadnet.a b/lib/esp32/libvadnet.a new file mode 100644 index 0000000..84ce0e6 Binary files /dev/null and b/lib/esp32/libvadnet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index cee6bd3..a54c0d0 100644 Binary files a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32p4/libc_speech_features.a b/lib/esp32p4/libc_speech_features.a index acb5e17..c99d6c3 100644 Binary files a/lib/esp32p4/libc_speech_features.a and b/lib/esp32p4/libc_speech_features.a differ diff --git a/lib/esp32p4/libdl_lib.a b/lib/esp32p4/libdl_lib.a index 664b727..e9a21be 100644 Binary files a/lib/esp32p4/libdl_lib.a and b/lib/esp32p4/libdl_lib.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index 0bcdd96..27faac9 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index a4b6de2..2601a45 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libflite_g2p.a b/lib/esp32p4/libflite_g2p.a index 2a50345..7efb003 100644 Binary files a/lib/esp32p4/libflite_g2p.a and b/lib/esp32p4/libflite_g2p.a differ diff --git a/lib/esp32p4/libfst.a b/lib/esp32p4/libfst.a index 4956bbe..9b6ca4e 100644 Binary files a/lib/esp32p4/libfst.a and b/lib/esp32p4/libfst.a differ diff --git a/lib/esp32p4/libhufzip.a b/lib/esp32p4/libhufzip.a index 6dee63f..8b13e4f 100644 Binary files 
a/lib/esp32p4/libhufzip.a and b/lib/esp32p4/libhufzip.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 1c73d70..408291f 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libnsnet.a b/lib/esp32p4/libnsnet.a index 2790d81..39ba6ff 100644 Binary files a/lib/esp32p4/libnsnet.a and b/lib/esp32p4/libnsnet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index b654035..70114e5 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index 4080869..67fe548 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 108af2e..46ee467 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index 29525a6..f27412e 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 7c1a1cc..8452944 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index a444b22..60fb949 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index 319a43c..c26d920 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 7cca9b0..5621d1b 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index e07fec7..685185f 
100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 16d6ec9..88f8846 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ index 5ba7d5f..9b9c6a7 100644 --- a/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ +++ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ @@ -1 +1 @@ -vadnet1_mediumv1_Speech_3_0.5_0.1 \ No newline at end of file +vadnet1_mediumv1_Speech_1_0.5_0.1 \ No newline at end of file diff --git a/src/esp_process_sdkconfig.c b/src/esp_process_sdkconfig.c index 626e195..329f458 100644 --- a/src/esp_process_sdkconfig.c +++ b/src/esp_process_sdkconfig.c @@ -958,4 +958,4 @@ end: esp_mn_commands_print(); return esp_mn_commands_update(); -} \ No newline at end of file +} diff --git a/test_apps/esp-sr/main/CMakeLists.txt b/test_apps/esp-sr/main/CMakeLists.txt index 8513954..d52b236 100644 --- a/test_apps/esp-sr/main/CMakeLists.txt +++ b/test_apps/esp-sr/main/CMakeLists.txt @@ -8,7 +8,7 @@ set(srcs idf_component_register(SRCS ${srcs} INCLUDE_DIRS "." 
"samples" - REQUIRES unity esp-sr + REQUIRES unity esp-sr esp_timer WHOLE_ARCHIVE) target_compile_options(${COMPONENT_LIB} PRIVATE "-Wno-format") diff --git a/test_apps/esp-sr/main/test_afe.cpp b/test_apps/esp-sr/main/test_afe.cpp index 975d5fd..ff20efe 100644 --- a/test_apps/esp-sr/main/test_afe.cpp +++ b/test_apps/esp-sr/main/test_afe.cpp @@ -12,7 +12,7 @@ #include #include "unity.h" #include "esp_log.h" - +#include "esp_timer.h" #include "model_path.h" #include "esp_wn_iface.h" #include "esp_wn_models.h" @@ -23,152 +23,187 @@ #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) #include "esp_nsn_models.h" #include "esp_nsn_iface.h" -#include "esp_vadn_models.h" -#include "esp_vadn_iface.h" #endif #define ARRAY_SIZE_OFFSET 8 // Increase this if audio_sys_get_real_time_stats returns ESP_ERR_INVALID_SIZE #define AUDIO_SYS_TASKS_ELAPSED_TIME_MS 1000 // Period of stats measurement static const char *TAG = "AFE_TEST"; -static volatile int s_cpu_test_task_flag = 0; -static esp_afe_sr_data_t *afe_data = NULL; - -static int total_ram_size_before = 0; -static int internal_ram_size_before = 0; -static int psram_size_before = 0; +static int detect_cnt = 0; +static int fetch_task_flag = 0; - -#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS) -const static char *task_state[] = { - "Running", - "Ready", - "Blocked", - "Suspended", - "Deleted" -}; - -/** @brief -* "Extr": Allocated task stack from psram, "Intr": Allocated task stack from internel -*/ -const static char *task_stack[] = {"Extr", "Intr"}; -#endif - - -TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<<", "[afe_sr]") +void test_afe_by_config(afe_config_t *afe_config, int frame_num, int* memory, float* cpu, int idx) { - int audio_chunksize = 0; - int16_t *feed_buff = NULL; + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + int first_end_size = 0; + int 
end_size = 0; + int mem_leak = 0; + uint32_t feed_cpu_time = 0; + uint32_t fetch_cpu_time = 0; + uint32_t start=0, end = 0; + int loop = 3; + int feed_chunksize = 0; + int create_size = 0; + int create_internal_size = 0; - for (int aec_init = 0; aec_init < 2; aec_init++) { - for (int se_init = 0; se_init < 2; se_init++) { - for (int vad_init = 0; vad_init < 2; vad_init++) { - for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) { - printf("aec_init: %d, se_init: %d, vad_init: %d, wakenet_init: %d\n", aec_init, se_init, vad_init, wakenet_init); + for (int i=0; icreate_from_config(afe_config); - int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); - srmodel_list_t *models = esp_srmodel_init("model"); - char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); - char *vad_model_name = NULL; -#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); -#endif + create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT); + create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL); - esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE; - afe_config_t afe_config = AFE_CONFIG_DEFAULT(); - afe_config.aec_init = aec_init; - afe_config.se_init = se_init; - afe_config.vad_init = vad_init; - afe_config.wakenet_init = wakenet_init; - afe_config.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM; - afe_config.wakenet_model_name = model_name; - afe_config.voice_communication_init = false; - afe_config.vad_model_name = vad_model_name; - if (vad_model_name) { - printf("vad_model_name:%s\n", vad_model_name); - } + // run afe feed + feed_chunksize = afe_handle->get_feed_chunksize(afe_data); + int feed_nch = afe_handle->get_feed_channel_num(afe_data); - // test model loading time - struct timeval tv_start, tv_end; - gettimeofday(&tv_start, NULL); - afe_data = 
afe_handle->create_from_config(&afe_config); - gettimeofday(&tv_end, NULL); - int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000; - printf("create latency:%d ms\n", tv_ms); + int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch); + start = esp_timer_get_time(); + for (int j=0; jfeed(afe_data, feed_buff); + } + end = esp_timer_get_time(); + feed_cpu_time += end - start; - // test model memory concumption - int create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT); - int create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL); - printf("Internal RAM: %d, PSRAM:%d\n", create_internal_size, create_size - create_internal_size); - afe_handle->destroy(afe_data); - esp_srmodel_deinit(models); + //run afe fetch + start = esp_timer_get_time(); + while(1) { + afe_fetch_result_t *res = afe_handle->fetch_with_delay(afe_data, 1 / portTICK_PERIOD_MS); + if (res->ret_value != ESP_OK) { + break; + } + } + end = esp_timer_get_time(); + fetch_cpu_time += end - start; + free(feed_buff); + afe_handle->destroy(afe_data); + end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - // test memory leak - int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - int last_end_size = first_end_size; - int mem_leak = start_size - last_end_size; - printf("create&destroy times:%d, memory leak:%d\n", 1, mem_leak); + if (i==0) { + first_end_size = end_size; + } + mem_leak = start_size - end_size; + ESP_LOGI(TAG, "create&destroy times:%d, memory leak:%d\n", i, mem_leak); + } + uint32_t feed_data_time = loop * frame_num * feed_chunksize / 16 * 1000; // us + memory[idx*2] = create_internal_size; + memory[idx*2+1] = create_size - create_internal_size; + cpu[idx*2] = feed_cpu_time*1.0/feed_data_time; + cpu[idx*2+1] = fetch_cpu_time*1.0/feed_data_time; + printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n", + memory[idx*2], memory[idx*2+1], 
cpu[idx*2], cpu[idx*2+1]); + TEST_ASSERT_EQUAL(true, mem_leak < 1000 && end_size == first_end_size); +} - for (int i = 0; i < 6; i++) { - printf("init partition ...\n"); - models = esp_srmodel_init("model"); - model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); -#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); -#endif - afe_config.wakenet_model_name = model_name; - afe_config.vad_model_name = vad_model_name; +TEST_CASE(">>>>>>>> AFE create/destroy API & memory leak <<<<<<<<", "[afe]") +{ + const char *input_format[6] = {"MR", "MMNR"}; + afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC}; + afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF}; + int count = 0; + int memory[512]; + float cpu[512]; - printf("create ...\n"); - afe_data = afe_handle->create_from_config(&afe_config); - - audio_chunksize = afe_handle->get_feed_chunksize(afe_data); - feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num); - assert(feed_buff); - - afe_handle->feed(afe_data, feed_buff); - printf("destroy ...\n"); - afe_handle->destroy(afe_data); - afe_data = NULL; - if (feed_buff) { - free(feed_buff); - feed_buff = NULL; + // test all setting + srmodel_list_t *models = esp_srmodel_init("model"); + for (int format_id=0; format_id<2; format_id++) { + for (int type_id=0; type_id<2; type_id++) { + for (int mode_id=0; mode_id<2; mode_id++) { + for (int aec_init = 0; aec_init < 2; aec_init++) { + for (int se_init = 0; se_init < 2; se_init++) { + for (int ns_init = 0; ns_init < 2; ns_init++) { + for (int vad_init = 0; vad_init < 2; vad_init++) { + for (int wakenet_init = 0; wakenet_init < 2; wakenet_init++) { + printf("format: %s, type: %d, mode: %d, memory size:%d %d\n", + input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count); + afe_config_t *afe_config = 
afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]); + afe_config->aec_init = aec_init; + afe_config->se_init = se_init; + afe_config->ns_init = ns_init; + afe_config->vad_init = vad_init; + afe_config->wakenet_init = wakenet_init; + test_afe_by_config(afe_config, 4, memory, cpu, count); + afe_config_free(afe_config); + count++; + } + } } - esp_srmodel_deinit(models); - - vTaskDelay(100 / portTICK_PERIOD_MS); - last_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - mem_leak = start_size - last_end_size; - printf("create&destroy times:%d, memory leak:%d\n", i + 2, mem_leak); } - - TEST_ASSERT_EQUAL(true, (mem_leak) < 1000 && last_end_size == first_end_size); } } } } + for (int idx=0; idx<256; idx++) { + printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n", + memory[idx*2], memory[idx*2+1], cpu[idx*2], cpu[idx*2+1]); + } + printf("AFE create/destroy API & memory leak test done\n"); } +TEST_CASE(">>>>>>>> AFE default setting <<<<<<<<", "[afe_benchmark]") +{ + const char *input_format[6] = {"MR", "MMNR"}; + afe_type_t afe_type[2] = {AFE_TYPE_SR, AFE_TYPE_VC}; + afe_mode_t afe_mode[2] = {AFE_MODE_LOW_COST, AFE_MODE_HIGH_PERF}; + int count = 0; + int memory[16]; + float cpu[16]; + + // test all setting + srmodel_list_t *models = esp_srmodel_init("model"); + for (int format_id=0; format_id<2; format_id++) { + for (int type_id=0; type_id<2; type_id++) { + for (int mode_id=0; mode_id<2; mode_id++) { + printf("format: %s, type: %d, mode: %d, memory size:%d %d\n", + input_format[format_id], afe_type[type_id], afe_mode[mode_id], heap_caps_get_free_size(MALLOC_CAP_8BIT), count); + afe_config_t *afe_config = afe_config_init(input_format[format_id], models, afe_type[type_id], afe_mode[mode_id]); + test_afe_by_config(afe_config, 8, memory, cpu, count); + afe_config_free(afe_config); + count++; + } + } + } + count = 0; + for (int format_id=0; format_id<2; format_id++) { + for (int type_id=0; type_id<2; type_id++) { 
+ for (int mode_id=0; mode_id<2; mode_id++) { + + printf("--------format: %s, type: %s, mode: %s------------\n", input_format[format_id], type_id==0? "SR": "VC", mode_id==0? "LOW_COST": "HIGH_PERF"); + printf("Internal RAM: %d, PSRAM:%d, feed cpu loading:%f, fetch cpu loading:%f\n", + memory[count*2], memory[count*2+1], cpu[count*2], cpu[count*2+1]); + count++; + } + } + } + printf("test done\n"); +} + + void test_feed_Task(void *arg) { - int sample_per_ms = 16; - // esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE; - esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg; + afe_task_into_t *afe_task_info = (afe_task_into_t *)arg; + esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle; + esp_afe_sr_data_t *afe_data = afe_task_info->afe_data; + int feed_chunksize = afe_handle->get_feed_chunksize(afe_data); - int total_nch = afe_handle->get_total_channel_num(afe_data); - int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * total_nch); + int feed_nch = afe_handle->get_feed_channel_num(afe_data); + int sample_per_ms = afe_handle->get_samp_rate(afe_data) / 1000; + int16_t *i2s_buff = (int16_t *) malloc(feed_chunksize * sizeof(int16_t) * feed_nch); assert(i2s_buff); ESP_LOGI(TAG, "feed task start\n"); - // FILE *fp = fopen("/sdcard/out", "w"); - // if (fp == NULL) printf("can not open file\n"); - - while (s_cpu_test_task_flag) { - // FatfsComboWrite(i2s_buff, audio_chunksize * I2S_CHANNEL_NUM * sizeof(int16_t), 1, fp); + int count = 0; + while (1) { + count ++; afe_handle->feed(afe_data, i2s_buff); vTaskDelay((feed_chunksize / sample_per_ms) / portTICK_PERIOD_MS); + if (count > 100) { + break; + } } if (i2s_buff) { free(i2s_buff); @@ -177,346 +212,89 @@ void test_feed_Task(void *arg) vTaskDelete(NULL); } -void test_detect_Task(void *arg) +void test_fetch_Task(void *arg) { // esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE; - esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)arg; - int fetch_chunksize = 
afe_handle->get_fetch_chunksize(afe_data); - int16_t *buff = (int16_t *) malloc(fetch_chunksize * sizeof(int16_t)); - assert(buff); - ESP_LOGI(TAG, "------------detect start------------\n"); - - // FILE *fp = fopen("/sdcard/out1", "w"); - // if (fp == NULL) printf("can not open file\n"); - - while (s_cpu_test_task_flag) { + afe_task_into_t *afe_task_info = (afe_task_into_t *)arg; + esp_afe_sr_iface_t *afe_handle = afe_task_info->afe_handle; + esp_afe_sr_data_t *afe_data = afe_task_info->afe_data; + detect_cnt = 0; + fetch_task_flag = 1; + while (1) { afe_fetch_result_t* res = afe_handle->fetch(afe_data); if (!res || res->ret_value == ESP_FAIL) { - printf("fetch error!\n"); break; } if (res->wakeup_state == WAKENET_DETECTED) { - ESP_LOGI(TAG, "wakeword detected\n"); - } - if (res->wakeup_state == WAKENET_CHANNEL_VERIFIED) { - ESP_LOGI(TAG, "AFE_FETCH_CHANNEL_VERIFIED\n"); + detect_cnt++; } } - if (buff) { - free(buff); - } + + // TEST_ASSERT_EQUAL(true, detect_cnt > 0); ESP_LOGI(TAG, "detect task quit\n"); + fetch_task_flag = 0; vTaskDelete(NULL); } -esp_err_t audio_sys_get_real_time_stats(void) +TEST_CASE("afe performance test (1ch)", "[afe_perf]") { -#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS) - TaskStatus_t *start_array = NULL, *end_array = NULL; - UBaseType_t start_array_size, end_array_size; - uint32_t start_run_time, end_run_time; - uint32_t task_elapsed_time, percentage_time; - esp_err_t ret; + const char *input_format = "MR"; + afe_type_t afe_type = AFE_TYPE_VC; + afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST}; - // Allocate array to store current task states - start_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET; - start_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * start_array_size); - assert(start_array); - // Get current task states - start_array_size = uxTaskGetSystemState(start_array, start_array_size, &start_run_time); - if (start_array_size == 0) { - ESP_LOGE(TAG, 
"Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET"); - ret = ESP_FAIL; - if (start_array) { - free(start_array); - start_array = NULL; - } - if (end_array) { - free(end_array); - end_array = NULL; - } - return ret; - } + // test all setting + srmodel_list_t *models = esp_srmodel_init("model"); - vTaskDelay(pdMS_TO_TICKS(AUDIO_SYS_TASKS_ELAPSED_TIME_MS)); - - // Allocate array to store tasks states post delay - end_array_size = uxTaskGetNumberOfTasks() + ARRAY_SIZE_OFFSET; - end_array = (TaskStatus_t*) malloc(sizeof(TaskStatus_t) * end_array_size); - assert(end_array); - - // Get post delay task states - end_array_size = uxTaskGetSystemState(end_array, end_array_size, &end_run_time); - if (end_array_size == 0) { - ESP_LOGE(TAG, "Insufficient array size for uxTaskGetSystemState. Trying increasing ARRAY_SIZE_OFFSET"); - ret = ESP_FAIL; - if (start_array) { - free(start_array); - start_array = NULL; - } - if (end_array) { - free(end_array); - end_array = NULL; - } - return ret; - } - - // Calculate total_elapsed_time in units of run time stats clock period. - uint32_t total_elapsed_time = (end_run_time - start_run_time); - if (total_elapsed_time == 0) { - ESP_LOGE(TAG, "Delay duration too short. 
Trying increasing AUDIO_SYS_TASKS_ELAPSED_TIME_MS"); - ret = ESP_FAIL; - if (start_array) { - free(start_array); - start_array = NULL; - } - if (end_array) { - free(end_array); - end_array = NULL; - } - return ret; - } - - ESP_LOGI(TAG, "| Task | Run Time | Per | Prio | HWM | State | CoreId | Stack "); - - // Match each task in start_array to those in the end_array - for (int i = 0; i < start_array_size; i++) { - for (int j = 0; j < end_array_size; j++) { - if (start_array[i].xHandle == end_array[j].xHandle) { - - task_elapsed_time = end_array[j].ulRunTimeCounter - start_array[i].ulRunTimeCounter; - percentage_time = (task_elapsed_time * 100UL) / (total_elapsed_time * portNUM_PROCESSORS); - ESP_LOGI(TAG, "| %-17s | %-11d |%2d%% | %-4u | %-9u | %-7s | %-8x | %s", - start_array[i].pcTaskName, task_elapsed_time, percentage_time, start_array[i].uxCurrentPriority, - start_array[i].usStackHighWaterMark, task_state[(start_array[i].eCurrentState)], - start_array[i].xCoreID, task_stack[esp_ptr_internal(pxTaskGetStackStart(start_array[i].xHandle))]); - - // Mark that task have been matched by overwriting their handles - start_array[i].xHandle = NULL; - end_array[j].xHandle = NULL; - break; + for (int mode_id=0; mode_id<2; mode_id++) { + afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]); + if (afe_config->wakenet_init && afe_config->wakenet_model_name) { + esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config); + esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config); + afe_task_into_t task_info; + task_info.afe_data = afe_data; + task_info.afe_handle = afe_handle; + task_info.feed_task = NULL; + task_info.fetch_task = NULL; + fetch_task_flag = 1; + xTaskCreatePinnedToCore(test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0); + xTaskCreatePinnedToCore(test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0); + while 
(fetch_task_flag) { + vTaskDelay(32 / portTICK_PERIOD_MS); } } + afe_config_free(afe_config); } - - // Print unmatched tasks - for (int i = 0; i < start_array_size; i++) { - if (start_array[i].xHandle != NULL) { - ESP_LOGI(TAG, "| %s | Deleted", start_array[i].pcTaskName); - } - } - for (int i = 0; i < end_array_size; i++) { - if (end_array[i].xHandle != NULL) { - ESP_LOGI(TAG, "| %s | Created", end_array[i].pcTaskName); - } - } - printf("\n"); - ret = ESP_OK; - - return ret; -#else - ESP_LOGW(TAG, "Please enbale `CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID` and `CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS` in menuconfig"); - return ESP_FAIL; -#endif -} - -void test_print_cpuloading(void *arg) -{ - while (s_cpu_test_task_flag) { - audio_sys_get_real_time_stats(); - int total_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT); - int internal_ram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - int psram_size_after = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - - ESP_LOGI(TAG, "total ram consume: %d KB", (total_ram_size_before - total_ram_size_after)/1024); - ESP_LOGI(TAG, "internal ram consume: %d KB", (internal_ram_size_before - internal_ram_size_after)/1024); - ESP_LOGI(TAG, "psram consume: %d KB\n\n", (psram_size_before - psram_size_after)/1024); - } - vTaskDelete(NULL); -} - -TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe_sr]") -{ - srmodel_list_t *models = esp_srmodel_init("model"); - if (models!=NULL) { - for (int i=0; i < models->num; i++) { - printf("Load: %s\n", models->model_name[i]); - } - } - char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); - printf("wn_name: %s\n", wn_name); - - total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT); - internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - - esp_afe_sr_iface_t *afe_handle = 
(esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE; - afe_config_t afe_config = AFE_CONFIG_DEFAULT(); - afe_config.wakenet_model_name = wn_name; - - afe_data = afe_handle->create_from_config(&afe_config); - if (!afe_data) { - printf("afe_data is null!\n"); - return; - } - - s_cpu_test_task_flag = 1; - xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0); - xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1); - xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1); - - vTaskDelay(10000 / portTICK_PERIOD_MS); - s_cpu_test_task_flag = 0; - - vTaskDelay(2000 / portTICK_PERIOD_MS); - ESP_LOGI(TAG, "destroy\n"); - afe_handle->destroy(afe_data); - afe_data = NULL; esp_srmodel_deinit(models); - ESP_LOGI(TAG, "successful\n"); } - - -/******************************************** Divide VC Test ********************************************/ - - - -TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe_vc]") +TEST_CASE("afe performance test (2ch)", "[afe_perf]") { - int start_total_mem_size = 0; - int start_internal_mem_size = 0; - int start_spiram_mem_size = 0; - int end_total_mem_size = 0; - int end_internal_mem_size = 0; - int end_spiram_mem_size = 0; + const char *input_format = "MMR"; + afe_type_t afe_type = AFE_TYPE_VC; + afe_mode_t afe_model[2] = {AFE_MODE_HIGH_PERF, AFE_MODE_LOW_COST}; - int audio_chunksize = 0; - int16_t *feed_buff = NULL; + // test all setting + srmodel_list_t *models = esp_srmodel_init("model"); - esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE; - afe_config_t afe_config = AFE_CONFIG_DEFAULT(); - afe_config.wakenet_init = false; - afe_config.voice_communication_init = true; - - for (int aec_init = 0; aec_init < 2; aec_init++) { - for (int se_init = 0; se_init < 2; se_init++) { - for (int vad_init = 0; vad_init < 2; vad_init++) { - for (int voice_communication_agc_init = 0; voice_communication_agc_init < 
2; voice_communication_agc_init++) { - #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - for (int afe_ns_mode = 0; afe_ns_mode < 2; afe_ns_mode++) { - #else - int afe_ns_mode = NS_MODE_SSP; - #endif - printf("aec_init: %d, se_init: %d, vad_init: %d, voice_communication_agc_init: %d, afe_ns_mode: %d\n", aec_init, se_init, vad_init, voice_communication_agc_init, afe_ns_mode); - afe_config.aec_init = aec_init; - afe_config.se_init = se_init; - afe_config.vad_init = vad_init; - afe_config.voice_communication_agc_init = voice_communication_agc_init; - afe_config.afe_ns_mode = (afe_ns_mode_t)afe_ns_mode; - - //start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - //start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - //start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - - for (int i = 0; i < 2; i++) { - printf("index: %d\n", i); - vTaskDelay(500 / portTICK_PERIOD_MS); - start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - srmodel_list_t *models = esp_srmodel_init("model"); - char *nsnet_name = NULL; - #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL); - #endif - printf("nsnet_name: %s\n", nsnet_name ? 
nsnet_name : ""); - afe_config.afe_ns_model_name = nsnet_name; - afe_data = afe_handle->create_from_config(&afe_config); - if (!afe_data) { - printf("afe_data is null\n"); - continue; - } - - audio_chunksize = afe_handle->get_feed_chunksize(afe_data); - feed_buff = (int16_t *) malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num); - assert(feed_buff); - - afe_handle->feed(afe_data, feed_buff); - afe_handle->destroy(afe_data); - afe_data = NULL; - if (feed_buff) { - free(feed_buff); - feed_buff = NULL; - } - - esp_srmodel_deinit(models); - vTaskDelay(1000 / portTICK_PERIOD_MS); - end_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); - end_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - end_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - - printf("memory leak: %d\n", start_total_mem_size - end_total_mem_size); - if (i > 0) { // skip index = 0 - TEST_ASSERT_EQUAL(start_internal_mem_size, end_internal_mem_size); - TEST_ASSERT_EQUAL(start_spiram_mem_size, end_spiram_mem_size); - TEST_ASSERT_EQUAL(start_total_mem_size, end_total_mem_size); - } else { - TEST_ASSERT_EQUAL(true, (start_total_mem_size - end_total_mem_size) < 1000); - } - } - #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - } - #endif - } + for (int mode_id=0; mode_id<2; mode_id++) { + afe_config_t *afe_config = afe_config_init(input_format, models, afe_type, afe_model[mode_id]); + if (afe_config->wakenet_init && afe_config->wakenet_model_name) { + esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config); + esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config); + afe_task_into_t task_info; + task_info.afe_data = afe_data; + task_info.afe_handle = afe_handle; + task_info.feed_task = NULL; + task_info.fetch_task = NULL; + fetch_task_flag = 1; + xTaskCreatePinnedToCore(&test_feed_Task, "feed_task", 8 * 1024, (void *)(&task_info), 5, &task_info.feed_task, 0); + 
xTaskCreatePinnedToCore(&test_fetch_Task, "fetch_task", 8 * 1024, (void *)(&task_info), 5, &task_info.fetch_task, 0); + while (fetch_task_flag) { + vTaskDelay(32 / portTICK_PERIOD_MS); } } + afe_config_free(afe_config); } -} - -TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe_vc]") -{ - total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT); - internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL); - psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM); - srmodel_list_t *models = esp_srmodel_init("model"); - char *nsnet_name = NULL; -#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL); -#endif - printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : ""); - - esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE; - afe_config_t afe_config = AFE_CONFIG_DEFAULT(); - afe_config.wakenet_init = false; - afe_config.voice_communication_init = true; - afe_config.voice_communication_agc_init = true; -#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) - afe_config.afe_ns_mode = NS_MODE_NET; -#else - afe_config.afe_ns_mode = NS_MODE_SSP; -#endif - afe_config.afe_ns_model_name = nsnet_name; - - afe_data = afe_handle->create_from_config(&afe_config); - if (!afe_data) { - printf("afe_data is null!\n"); - return; - } - - s_cpu_test_task_flag = 1; - xTaskCreatePinnedToCore(&test_feed_Task, "feed", 8 * 1024, (void *)afe_handle, 5, NULL, 0); - xTaskCreatePinnedToCore(&test_detect_Task, "detect", 8 * 1024, (void *)afe_handle, 5, NULL, 1); - xTaskCreatePinnedToCore(&test_print_cpuloading, "cpuloading", 4 * 1024, NULL, 5, NULL, 1); - - vTaskDelay(20000 / portTICK_PERIOD_MS); - s_cpu_test_task_flag = 0; - - vTaskDelay(2000 / portTICK_PERIOD_MS); - ESP_LOGI(TAG, "destroy\n"); - afe_handle->destroy(afe_data); esp_srmodel_deinit(models); - afe_data = NULL; - ESP_LOGI(TAG, "successful\n"); -} +} \ No 
newline at end of file diff --git a/test_apps/esp-sr/pytest_esp_sr.py b/test_apps/esp-sr/pytest_esp_sr.py index cfd3e41..85b584a 100644 --- a/test_apps/esp-sr/pytest_esp_sr.py +++ b/test_apps/esp-sr/pytest_esp_sr.py @@ -35,7 +35,6 @@ def test_multinet_p4(dut: Dut)-> None: @pytest.mark.parametrize( 'config', [ - 'mn5q8_en', 'wn9_hilexin', ], ) @@ -47,8 +46,7 @@ def test_wakenet(dut: Dut)-> None: @pytest.mark.parametrize( 'config', [ - 'p4_mn7_en', - 'p4_nsnet2', + 'p4_wn9_hilexin', ], ) def test_wakenet_p4(dut: Dut)-> None: @@ -59,44 +57,21 @@ def test_wakenet_p4(dut: Dut)-> None: @pytest.mark.parametrize( 'config', [ + 'afe', 'wn9_hilexin', - 'vadnet', ], ) def test_sr_afe(dut: Dut)-> None: - dut.run_all_single_board_cases(group="afe_sr", timeout=100000) + dut.run_all_single_board_cases(group="afe", timeout=3600) @pytest.mark.target('esp32p4') @pytest.mark.env('esp32p4') @pytest.mark.parametrize( 'config', [ - 'p4_mn7_cn', + 'p4_afe', + 'p4_wn9_hilexin', ], ) def test_sr_afe_p4(dut: Dut)-> None: - dut.run_all_single_board_cases(group="afe_sr", timeout=100000) - - -@pytest.mark.target('esp32s3') -@pytest.mark.env('esp32s3') -@pytest.mark.parametrize( - 'config', - [ - 'nsnet2', - ], -) -def test_vc_afe(dut: Dut)-> None: - dut.run_all_single_board_cases(group="afe_vc", timeout=100000) - - -@pytest.mark.target('esp32p4') -@pytest.mark.env('esp32p4') -@pytest.mark.parametrize( - 'config', - [ - 'p4_nsnet2', - ], -) -def test_vc_afe_p4(dut: Dut)-> None: - dut.run_all_single_board_cases(group="afe_vc", timeout=100000) \ No newline at end of file + dut.run_all_single_board_cases(group="afe", timeout=3600) diff --git a/test_apps/esp-sr/sdkconfig.ci.vadnet b/test_apps/esp-sr/sdkconfig.ci.afe similarity index 87% rename from test_apps/esp-sr/sdkconfig.ci.vadnet rename to test_apps/esp-sr/sdkconfig.ci.afe index 235a812..97bf120 100644 --- a/test_apps/esp-sr/sdkconfig.ci.vadnet +++ b/test_apps/esp-sr/sdkconfig.ci.afe @@ -2,20 +2,22 @@ # Espressif IoT Development Framework 
(ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" -CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y CONFIG_SR_VADN_VADNET1_MEDIUM=y CONFIG_SR_WN_WN9_HILEXIN=y +CONFIG_SR_NSN_NSNET2=y CONFIG_SPIRAM=y +CONFIG_ESP_TASK_WDT_EN=n +CONFIG_ESP_TASK_WDT_INIT=n +CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240 CONFIG_SPIRAM_MODE_OCT=y CONFIG_SPIRAM_SPEED_80M=y CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y CONFIG_ESP32S3_DATA_CACHE_64KB=y CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y -CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192 CONFIG_ESP_WIFI_GMAC_SUPPORT=n CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y diff --git a/test_apps/esp-sr/sdkconfig.ci.mn2_cn b/test_apps/esp-sr/sdkconfig.ci.mn2_cn index 32d8eb1..17fb0d0 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn2_cn +++ b/test_apps/esp-sr/sdkconfig.ci.mn2_cn @@ -1,5 +1,5 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 @@ -9,6 +9,10 @@ CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y CONFIG_PARTITION_TABLE_CUSTOM=y CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_esp32.csv" CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION=y +CONFIG_COMPILER_OPTIMIZATION_PERF=y +CONFIG_SPIRAM=y +CONFIG_SPIRAM_SPEED_80M=y +CONFIG_ESP_INT_WDT_TIMEOUT_MS=1000 CONFIG_ESP_WIFI_GMAC_SUPPORT=n CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744 CONFIG_LWIP_TCP_WND_DEFAULT=5744 diff --git a/test_apps/esp-sr/sdkconfig.ci.nsnet2 b/test_apps/esp-sr/sdkconfig.ci.nsnet2 deleted file mode 100644 index e421ae1..0000000 --- a/test_apps/esp-sr/sdkconfig.ci.nsnet2 +++ /dev/null @@ -1,23 +0,0 @@ -# This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration -# -CONFIG_IDF_TARGET="esp32s3" -CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 -CONFIG_ESPTOOLPY_FLASHMODE_QIO=y -CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y -CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_SR_NSN_NSNET2=y -CONFIG_SPIRAM=y -CONFIG_SPIRAM_MODE_OCT=y -CONFIG_SPIRAM_SPEED_80M=y -CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y -CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y -CONFIG_ESP32S3_DATA_CACHE_64KB=y -CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y -CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192 -CONFIG_ESP_WIFI_GMAC_SUPPORT=n -CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y -CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y -CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744 -CONFIG_LWIP_TCP_WND_DEFAULT=5744 -CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024 diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_afe b/test_apps/esp-sr/sdkconfig.ci.p4_afe new file mode 100644 index 0000000..7365041 --- /dev/null +++ b/test_apps/esp-sr/sdkconfig.ci.p4_afe @@ -0,0 +1,23 @@ +# This file was generated using idf.py save-defconfig. It can be edited manually. 
+# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration +# +CONFIG_IDF_TARGET="esp32p4" +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y +CONFIG_SR_WN_WN9_HILEXIN=y +CONFIG_SR_NSN_NSNET2=y +CONFIG_SPIRAM=y +CONFIG_ESP_TASK_WDT_EN=n +CONFIG_ESP_TASK_WDT_INIT=n +CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240 +CONFIG_COMPILER_OPTIMIZATION_PERF=y +CONFIG_ESP32P4_REV_MIN_0=y +CONFIG_SPIRAM=y +CONFIG_SPIRAM_SPEED_200M=y +CONFIG_CACHE_L2_CACHE_256KB=y +CONFIG_CACHE_L2_CACHE_LINE_128B=y +CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n +CONFIG_MBEDTLS_CMAC_C=y +CONFIG_IDF_EXPERIMENTAL_FEATURES=y diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn index 36f99e6..963498d 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn +++ b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn @@ -1,10 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_NSN_NSNET2=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_SR_MN_CN_MULTINET7_QUANT=y CONFIG_COMPILER_OPTIMIZATION_PERF=y @@ -14,7 +16,6 @@ CONFIG_SPIRAM_SPEED_200M=y CONFIG_CACHE_L2_CACHE_256KB=y CONFIG_CACHE_L2_CACHE_LINE_128B=y CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n -CONFIG_ESP_MAIN_TASK_STACK_SIZE=8000 CONFIG_ESP_INT_WDT=n CONFIG_ESP_TASK_WDT_EN=n CONFIG_FREERTOS_HZ=1000 diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_en b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_en index c1f6ab6..9102ad8 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_en +++ b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_en @@ -1,10 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_NSN_NSNET2=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y CONFIG_SR_WN_WN9_HIESP=y CONFIG_SR_MN_EN_MULTINET7_QUANT=y CONFIG_COMPILER_OPTIMIZATION_PERF=y diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 b/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin similarity index 73% rename from test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 rename to test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin index f9e7d8d..d9f287e 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 +++ b/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin @@ -1,22 +1,20 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_SR_WN_WN9_HIESP=y -CONFIG_SR_NSN_NSNET2=y +CONFIG_SR_WN_WN9_HILEXIN=y +CONFIG_SPIRAM=y +CONFIG_ESP_TASK_WDT_EN=n +CONFIG_ESP_TASK_WDT_INIT=n +CONFIG_ESP_MAIN_TASK_STACK_SIZE=10240 CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_ESP32P4_REV_MIN_0=y -CONFIG_SPIRAM=y CONFIG_SPIRAM_SPEED_200M=y CONFIG_CACHE_L2_CACHE_256KB=y CONFIG_CACHE_L2_CACHE_LINE_128B=y CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n -CONFIG_ESP_MAIN_TASK_STACK_SIZE=10000 -CONFIG_ESP_INT_WDT=n -CONFIG_ESP_TASK_WDT_EN=n -CONFIG_FREERTOS_HZ=1000 CONFIG_MBEDTLS_CMAC_C=y CONFIG_IDF_EXPERIMENTAL_FEATURES=y diff --git a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin index e002d55..d4174c9 100644 --- a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin +++ b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin @@ -2,13 +2,13 @@ # Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" -CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y CONFIG_SR_WN_WN9_HILEXIN=y -CONFIG_ESP_PHY_REDUCE_TX_POWER=y CONFIG_SPIRAM=y +CONFIG_ESP_TASK_WDT_EN=n +CONFIG_ESP_TASK_WDT_INIT=n CONFIG_SPIRAM_MODE_OCT=y CONFIG_SPIRAM_SPEED_80M=y CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y @@ -21,4 +21,4 @@ CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744 CONFIG_LWIP_TCP_WND_DEFAULT=5744 -CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024 +CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024 \ No newline at end of file diff --git a/test_apps/esp-sr/sdkconfig.defaults b/test_apps/esp-sr/sdkconfig.defaults new file mode 100644 index 
0000000..ce9d0fb --- /dev/null +++ b/test_apps/esp-sr/sdkconfig.defaults @@ -0,0 +1,6 @@ +# This file was generated using idf.py save-defconfig. It can be edited manually. +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration +# + +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y \ No newline at end of file