diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index a649f77..16bef86 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -100,8 +100,6 @@ before_script: artifacts: when: always paths: - - "**/build*/size.json" - - "**/build*/build_log.txt" - "**/build*/*.bin" # upload to s3 server to save the artifacts size - "**/build*/*.map" diff --git a/CMakeLists.txt b/CMakeLists.txt index c405182..5f768f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3") add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) @@ -95,6 +96,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3") esp_tts_chinese voice_set_xiaole nsnet + vadnet wakenet "-Wl,--end-group") @@ -153,6 +155,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4") add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(multinet 
"${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) @@ -173,6 +176,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4") esp_tts_chinese voice_set_xiaole wakenet + vadnet nsnet "-Wl,--end-group") diff --git a/Kconfig.projbuild b/Kconfig.projbuild index dde6613..f6b3071 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -13,14 +13,9 @@ choice MODEL_DATA_PATH endchoice -config USE_AFE - bool "use afe" - default "y" - choice AFE_INTERFACE_SEL prompt "Afe interface" default AFE_INTERFACE_V1 - depends on USE_AFE help Select the afe interface to be used. @@ -29,306 +24,175 @@ choice AFE_INTERFACE_SEL endchoice -config USE_NSNET - bool "use nsnet" - default "n" - choice SR_NSN_MODEL_LOAD - prompt "Select deep noise suppression" - default SR_NSN_NSNET2 - depends on USE_NSNET + prompt "Select noise suppression model" + default SR_NSN_WEBRTC help - Select the deep noise suppression to be loaded. + Select the noise suppression model to be loaded. - config SR_NSN_NONE - bool "None" + config SR_NSN_WEBRTC + bool "noise suppression (WebRTC)" - config SR_NSN_NSNET1 - bool "Deep noise suppression v1 (nsnet1)" - depends on IDF_TARGET_ESP32S3 config SR_NSN_NSNET2 bool "Deep noise suppression v2 (nsnet2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 endchoice -config USE_WAKENET - bool "use wakenet" - default "y" +choice SR_VADN_MODEL_LOAD + prompt "Select voice activity detection" + default SR_VADN_WEBRTC + help + Select the vad model to be loaded. 
+ + config SR_VADN_WEBRTC + bool "voice activity detection (WebRTC)" + + config SR_VADN_VADNET1_MEDIUM + bool "voice activity detection (vadnet1 medium)" + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 +endchoice choice SR_WN_MODEL_LOAD prompt "Select wake words" - default SR_WN_WN9_HILEXIN - depends on USE_WAKENET + default SR_WN_WN5_HILEXIN + depends on IDF_TARGET_ESP32 help Select the Wake Words to be loaded. config SR_WN_WN5_HILEXIN - bool "Hi,乐鑫 (wn5_hilexin)" - depends on IDF_TARGET_ESP32 + bool "Hi,Lexin (wn5_hilexin)" config SR_WN_WN5X3_HILEXIN - bool "Hi,乐鑫 (wn5_hilexinX3)" - depends on IDF_TARGET_ESP32 + bool "Hi,Lexin (wn5_hilexinX3)" config SR_WN_WN5_NIHAOXIAOZHI - bool "你好小智 (wn5_nihaoxiaozhi)" - depends on IDF_TARGET_ESP32 + bool "nihaoxiaozhi (wn5_nihaoxiaozhi)" config SR_WN_WN5X3_NIHAOXIAOZHI - bool "你好小智 (wn5_nihaoxiaozhiX3)" - depends on IDF_TARGET_ESP32 + bool "nihaoxiaozhi (wn5_nihaoxiaozhiX3)" config SR_WN_WN5X3_NIHAOXIAOXIN - bool "你好小鑫 (wn5_nihaoxiaoxinX3)" - depends on IDF_TARGET_ESP32 - - config SR_WN_WN8_ALEXA - bool "Alexa (wn8_alexa)" - depends on IDF_TARGET_ESP32S3 - - config SR_WN_WN9_HILEXIN - bool "Hi,乐鑫 (wn9_hilexin)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_XIAOAITONGXUE - bool "小爱同学 (wn9_xiaoaitongxue)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_ALEXA - bool "Alexa (wn9_alexa)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIESP - bool "Hi,ESP (wn9_hiesp)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIMFIVE - bool "Hi,M Five (wn9_himfive)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_NIHAOXIAOZHI_TTS - bool "你好小智 (wn9_nihaoxiaozhi_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_JARVIS_TTS - bool "Jarvis (wn9_jarvis_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_COMPUTER_TTS - bool "computer 
(wn9_computer_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HEYWILLOW_TTS - bool "Hey,Willow (wn9_heywillow_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_SOPHIA_TTS - bool "Sophia (wn9_sophia_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_NIHAOXIAOXIN_TTS - bool "你好小鑫 (wn9_nihaoxiaoxin_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_XIAOMEITONGXUE_TTS - bool "小美同学 (wn9_xiaomeitongxue_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIXIAOXING_TTS - bool "Hi,小星 (wn9_hixiaoxing_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_MYCROFT_TTS - bool "Mycroft (wn9_mycroft_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HEYPRINTER_TTS - bool "Hey,Printer (wn9_heyprinter_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_XIAOLONGXIAOLONG_TTS - bool "小龙小龙 (wn9_xiaolongxiaolong_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_MIAOMIAOTONGXUE_TTS - bool "喵喵同学 (wn9_miaomiaotongxue_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIJOY_TTS - bool "Hi,Joy (wn9_hijoy_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HILILI_TTS - bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HITELLY_TTS - bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HEYWANDA_TTS - bool "Hey,Wanda (wn9_heywanda_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIMIAOMIAO_TTS - bool "Hi,喵喵 (wn9_himiaomiao_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_XIAOBINXIAOBIN_TTS - bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - 
- config SR_WN_WN9_HAIXIAOWU_TTS - bool "Hi,小巫 (wn9_haixiaowu_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_ASTROLABE_TTS - bool "Astrolabe (wn9_astrolabe_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_XIAOYAXIAOYA_TTS2 - bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_HIJASON_TTS2 - bool "Hi,Jason (wn9_hijason_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_LINAIBAN_TTS2 - bool "璃奈板 (wn9_linaiban_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_WN9_CUSTOMWORD - bool "customized word (wn9_customword)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - - config SR_WN_LOAD_MULIT_WORD - bool "Load Multiple Wake Words" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + bool "nihaoxiaoxin (wn5_nihaoxiaoxinX3)" endchoice menu "Load Multiple Wake Words" - depends on SR_WN_LOAD_MULIT_WORD + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - config SR_WN_WN9_HILEXIN_MULTI + config SR_WN_WN9_HILEXIN bool "Hi,乐鑫 (wn9_hilexin)" default False - config SR_WN_WN9_XIAOAITONGXUE_MULTI + config SR_WN_WN9_XIAOAITONGXUE bool "小爱同学 (wn9_xiaoaitongxue)" default False - config SR_WN_WN9_NIHAOXIAOZHI_TTS_MULTI + config SR_WN_WN9_NIHAOXIAOZHI_TTS bool "你好小智 (wn9_nihaoxiaozhi_tts)" default False - config SR_WN_WN9_ALEXA_MULTI + config SR_WN_WN9_ALEXA bool "Alexa (wn9_alexa)" default False - config SR_WN_WN9_HIESP_MULTI + config SR_WN_WN9_HIESP bool "Hi,ESP (wn9_hiesp)" default False - config SR_WN_WN9_JARVIS_TTS_MULTI + config SR_WN_WN9_JARVIS_TTS bool "Jarvis (wn9_jarvis_tts)" default False - config SR_WN_WN9_COMPUTER_TTS_MULTI + config SR_WN_WN9_COMPUTER_TTS bool "computer (wn9_computer_tts)" default False - config SR_WN_WN9_HEYWILLOW_TTS_MULTI + config SR_WN_WN9_HEYWILLOW_TTS bool "Hey,Willow (wn9_heywillow_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - 
config SR_WN_WN9_SOPHIA_TTS_MULTI + config SR_WN_WN9_SOPHIA_TTS bool "Sophia (wn9_sophia_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_NIHAOXIAOXIN_TTS_MULTI + config SR_WN_WN9_NIHAOXIAOXIN_TTS bool "你好小鑫 (wn9_nihaoxiaoxin_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_XIAOMEITONGXUE_TTS_MULTI + config SR_WN_WN9_XIAOMEITONGXUE_TTS bool "小美同学 (wn9_xiaomeitongxue_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HEYPRINTER_TTS_MULTI + config SR_WN_WN9_HEYPRINTER_TTS bool "Hey,Printer (wn9_heyprinter_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_XIAOLONGXIAOLONG_TTS_MULTI + config SR_WN_WN9_XIAOLONGXIAOLONG_TTS bool "小龙小龙 (wn9_xiaolongxiaolong_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_MIAOMIAOTONGXUE_TTS_MULTI + config SR_WN_WN9_MIAOMIAOTONGXUE_TTS bool "喵喵同学 (wn9_miaomiaotongxue_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HEYWANDA_TTS_MULTI + config SR_WN_WN9_HEYWANDA_TTS bool "Hey,Wanda (wn9_heywanda_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HIMIAOMIAO_TTS_MULTI + config SR_WN_WN9_HIMIAOMIAO_TTS bool "Hi,喵喵 (wn9_himiaomiao_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_MYCROFT_TTS_MULTI + config SR_WN_WN9_MYCROFT_TTS bool "Mycroft (wn9_mycroft_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HIJOY_TTS_MULTI + config SR_WN_WN9_HIJOY_TTS bool "Hi,Joy (wn9_hijoy_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HILILI_TTS_MULTI + config SR_WN_WN9_HILILI_TTS bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config 
SR_WN_WN9_HITELLY_TTS_MULTI + config SR_WN_WN9_HITELLY_TTS bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_XIAOBINXIAOBIN_TTS_MULTI + config SR_WN_WN9_XIAOBINXIAOBIN_TTS bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HAIXIAOWU_TTS_MULTI + config SR_WN_WN9_HAIXIAOWU_TTS bool "Hi,小巫 (wn9_haixiaowu_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_ASTROLABE_TTS_MULTI + config SR_WN_WN9_ASTROLABE_TTS bool "Astrolabe (wn9_astrolabe_tts)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_XIAOYAXIAOYA_TTS2_MULTI + config SR_WN_WN9_XIAOYAXIAOYA_TTS2 bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_HIJASON_TTS2_MULTI + config SR_WN_WN9_HIJASON_TTS2 bool "Hi,Jason (wn9_hijason_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False - config SR_WN_WN9_LINAIBAN_TTS2_MULTI + config SR_WN_WN9_LINAIBAN_TTS2 bool "璃奈板 (wn9_linaiban_tts2)" - depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + default False endmenu -config USE_MULTINET - bool "use multinet" - default "y" choice CHINESE_SR_MN_MODEL_SEL prompt "Chinese Speech Commands Model" - default SR_MN_CN_MULTINET6_QUANT - depends on USE_MULTINET + default SR_MN_CN_NONE help - Select the Wake Word Engine to be used. + Select the Chinese Speech Commands Model. config SR_MN_CN_NONE bool "None" @@ -362,9 +226,8 @@ endchoice choice ENGLISH_SR_MN_MODEL_SEL prompt "English Speech Commands Model" default SR_MN_EN_NONE - depends on USE_MULTINET help - Select the Wake Word Engine to be used. + Select the English Speech Commands Model. 
config SR_MN_EN_NONE bool "None" diff --git a/conftest.py b/conftest.py index 4c063b3..c6a3038 100644 --- a/conftest.py +++ b/conftest.py @@ -202,7 +202,7 @@ class IdfPytestEmbedded: for item in items: # default timeout 5 mins if 'timeout' not in item.keywords: - item.add_marker(pytest.mark.timeout(8 * 60)) + item.add_marker(pytest.mark.timeout(500 * 60)) # filter all the test cases with "--target" if self.target: diff --git a/include/esp32/dl_lib.h b/include/esp32/dl_lib.h index 63ba6da..47e7c86 100644 --- a/include/esp32/dl_lib.h +++ b/include/esp32/dl_lib.h @@ -78,7 +78,7 @@ void *dl_lib_calloc_psram(int cnt, int size, int align); /** * @brief Free aligned memory allocated by `dl_lib_calloc` or `dl_lib_calloc_psram` * - * @param prt Pointer to free + * @param ptr Pointer to free */ void dl_lib_free(void *ptr); @@ -415,4 +415,4 @@ dl_matrix2d_t *dl_basic_conv_layer_quantised_weight(const dl_matrix2d_t *in, con } #endif -#endif \ No newline at end of file +#endif diff --git a/include/esp32/dl_lib_convq8_queue.h b/include/esp32/dl_lib_convq8_queue.h index 0e53902..28c5da7 100644 --- a/include/esp32/dl_lib_convq8_queue.h +++ b/include/esp32/dl_lib_convq8_queue.h @@ -292,6 +292,7 @@ qtp_t *dl_atrous_conv1dq8_16_s3(dl_convq8_queue_t *in, dl_convq_queue_t *out, in void print_convq8(dl_convq8_queue_t *cq, int offset); void print_convq(dl_convq_queue_t *cq, int offset); +void dl_relu_convq8(dl_convq8_queue_t *cq); void lstmq8_free(void); diff --git a/include/esp32/dl_lib_convq_queue.h b/include/esp32/dl_lib_convq_queue.h index c71d5ca..ff190fe 100644 --- a/include/esp32/dl_lib_convq_queue.h +++ b/include/esp32/dl_lib_convq_queue.h @@ -279,9 +279,9 @@ dl_matrix2dq_t *dl_convq_lstm_layer(const dl_convq_queue_t *in, dl_convq_queue_t dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_t *state_c, dl_matrix2dq_t *state_h, const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift); -dl_matrix2dq_t *dl_convq16_lstm_layer(const 
dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c, - dl_matrix2dq_t *state_h, const dl_matrix2dq_t *in_weight, const dl_matrix2dq_t *h_weight, - const dl_matrix2dq_t *bias, int prenum); +dl_matrix2dq_t *dl_convq16_lstm_layer(dl_convq_queue_t *in, dl_convq_queue_t *out, dl_matrix2dq_t *state_c, + dl_matrix2dq_t *state_h, dl_matrix2dq_t *in_weight, dl_matrix2dq_t *h_weight, + dl_matrix2dq_t *bias, int prenum); /** * @brief Allocate a fixed-point multi channel convolution queue diff --git a/include/esp32/dl_lib_matrix.h b/include/esp32/dl_lib_matrix.h index b5fae74..59f7d79 100644 --- a/include/esp32/dl_lib_matrix.h +++ b/include/esp32/dl_lib_matrix.h @@ -25,10 +25,6 @@ extern "C" { #endif -// #ifdef CONFIG_IDF_TARGET_ESP32S3 -// #include "dl_tie728_bzero.h" -// #endif - typedef float fptp_t; #if CONFIG_BT_SHARE_MEM_REUSE diff --git a/include/esp32/esp_aec.h b/include/esp32/esp_aec.h index 03afc90..deb031c 100644 --- a/include/esp32/esp_aec.h +++ b/include/esp32/esp_aec.h @@ -23,7 +23,8 @@ extern "C" { #define USE_AEC_FFT // Not kiss_fft #define AEC_USE_SPIRAM 0 #define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz -#define AEC_FRAME_LENGTH_MS 16 +//#define AEC_FRAME_LENGTH_MS 16 +#define AEC_FRAME_LENGTH_MS 32 #define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel typedef void* aec_handle_t; diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 702d859..c32689d 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -90,6 +90,12 @@ typedef struct { afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; afe_ns_mode_t afe_ns_mode; char *afe_ns_model_name; + bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone + // otherwise, select channel number by wakenet + char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small + int vad_min_speech_ms; // The minimum duration of speech in ms. 
It should be bigger than 32 ms + int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; @@ -123,6 +129,47 @@ typedef struct { .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ +} +#elif CONFIG_IDF_TARGET_ESP32P4 +#define AFE_CONFIG_DEFAULT() { \ + .aec_init = true, \ + .se_init = true, \ + .vad_init = true, \ + .wakenet_init = true, \ + .voice_communication_init = false, \ + .voice_communication_agc_init = false, \ + .voice_communication_agc_gain = 15, \ + .vad_mode = VAD_MODE_3, \ + .wakenet_model_name = NULL, \ + .wakenet_model_name_2 = NULL, \ + .wakenet_mode = DET_MODE_90, \ + .afe_mode = SR_MODE_LOW_COST, \ + .afe_perferred_core = 0, \ + .afe_perferred_priority = 5, \ + .afe_ringbuf_size = 50, \ + .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ + .afe_linear_gain = 1.0, \ + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ + .pcm_config = { \ + .total_ch_num = 2, \ + .mic_num = 1, \ + .ref_num = 1, \ + .sample_rate = 16000, \ + }, \ + .debug_init = false, \ + .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ + .afe_ns_mode = NS_MODE_SSP, \ + .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -154,6 +201,11 @@ typedef struct { .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + 
.vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #endif diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index daf5b92..84d7000 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t { int16_t *data; // the data of audio. int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. wakenet_state_t wakeup_state; // the value is wakenet_state_t @@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. afe_vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. It's unit is the number of samples. + int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function void* reserved; // reserved for future use } afe_fetch_result_t; @@ -112,7 +114,7 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af * @brief reset ringbuf of AFE. 
* * @param afe The AFE_SR object to query - * @return -1: fail, 0: success + * @return -1: fail, 1: success */ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); @@ -122,7 +124,7 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); * * @param afe The AFE_SR object to query * @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD - * @return 0: fail, 1: success + * @return -1: fail, 1: success */ typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); @@ -130,7 +132,7 @@ typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* m * @brief Disable wakenet model. * * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); @@ -138,7 +140,7 @@ typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe); * @brief Enable wakenet model. * * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); @@ -146,7 +148,7 @@ typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe); * @brief Disable AEC algorithm. * * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); @@ -154,7 +156,7 @@ typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe); * @brief Enable AEC algorithm. 
* * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); @@ -162,7 +164,7 @@ typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe); * @brief Disable SE algorithm. * * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe); @@ -170,7 +172,7 @@ typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe); * @brief Enable SE algorithm. * * @param afe The AFE_SR object to query - * @return 0: fail, 1: success + * @return -1: fail, 0: disabled, 1: enabled */ typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe); diff --git a/include/esp32/esp_afe_sr_models.h b/include/esp32/esp_afe_sr_models.h index feaad43..39de63f 100644 --- a/include/esp32/esp_afe_sr_models.h +++ b/include/esp32/esp_afe_sr_models.h @@ -4,7 +4,6 @@ extern "C" { #endif -#if defined CONFIG_USE_AFE #include "esp_afe_sr_iface.h" @@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1; #endif -#else - - -#include "esp_afe_sr_iface.h" -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#endif - #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32/esp_nsn_models.h b/include/esp32/esp_nsn_models.h index 0a7e334..8165e27 100644 --- a/include/esp32/esp_nsn_models.h +++ b/include/esp32/esp_nsn_models.h @@ -2,8 +2,16 @@ #include "esp_nsn_iface.h" -// The prefix of nsnet model name is used to filter all wakenet from availabel models. 
+/* +The prefix of nsnet +Now there are nsnet1 and nsnet2 +*/ #define ESP_NSNET_PREFIX "nsnet" -extern const esp_nsn_iface_t esp_nsnet1_quantized; -#define ESP_NSN_HANDLE esp_nsnet1_quantized \ No newline at end of file +/** + * @brief Get the nsnet handle from model name + * + * @param model_name The name of model + * @returns The handle of nsnet + */ +esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name); diff --git a/include/esp32/esp_vad.h b/include/esp32/esp_vad.h index 2440d39..90f8e20 100644 --- a/include/esp32/esp_vad.h +++ b/include/esp32/esp_vad.h @@ -25,22 +25,65 @@ extern "C" { /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more - * restrictive in reporting speech. + * restrictive in reporting speech. So if you want to trigger more speech, please select a lower mode. */ typedef enum { - VAD_MODE_0 = 0, - VAD_MODE_1, - VAD_MODE_2, - VAD_MODE_3, - VAD_MODE_4 + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { VAD_SILENCE = 0, - VAD_SPEECH + VAD_SPEECH = 1, } vad_state_t; -typedef void* vad_handle_t; +typedef struct vad_trigger_tag { + vad_state_t state; + unsigned int min_speech_len; + unsigned int noise_len; + unsigned int min_noise_len; + unsigned int speech_len; +} vad_trigger_t; + +#define vad_MAX_LEN INT32_MAX - 1 +/** + * @brief Allocate vad trigger + * + * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * + * @return Trigger pointer + **/ +vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); + +/** + * @brief Free vad trigger + **/ +void vad_trigger_free(vad_trigger_t *trigger); + +/** + * @brief Reset vad trigger + **/ +void vad_trigger_reset(vad_trigger_t *trigger); + +/** + * @brief Detect voice activity by trigger + **/ +vad_state_t 
vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); + + +typedef struct { + vad_trigger_t *trigger; + void *vad_inst; +}vad_handle_with_trigger_t; + +typedef vad_handle_with_trigger_t* vad_handle_t; + +// typedef vad_handle_tag * vad_handle_t; + /** * @brief Creates an instance to the VAD structure. @@ -53,6 +96,18 @@ typedef void* vad_handle_t; */ vad_handle_t vad_create(vad_mode_t vad_mode); +/** + * @brief Creates an instance to the VAD structure. + * + * @param vad_mode Sets the VAD operating mode. + * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * @return + * - NULL: Create failed + * - Others: The instance of VAD + */ +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); + /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * diff --git a/include/esp32/flite_g2p.h b/include/esp32/flite_g2p.h index 55aeaa6..e91425e 100644 --- a/include/esp32/flite_g2p.h +++ b/include/esp32/flite_g2p.h @@ -9,7 +9,7 @@ typedef struct { void flite_g2p_result_free(flite_g2p_result *result); -flite_g2p_result *flite_g2p_get_result(char *grapheme); +flite_g2p_result *flite_g2p_get_result(const char *grapheme); void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes); diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index 6cac4c6..c32689d 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -92,6 +92,10 @@ typedef struct { char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet + char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small + int vad_min_speech_ms; // The minimum duration of speech in ms. 
It should be bigger than 32 ms + int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; @@ -126,6 +130,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32P4 #define AFE_CONFIG_DEFAULT() { \ @@ -158,6 +166,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -190,6 +202,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #endif diff --git a/include/esp32p4/esp_afe_sr_iface.h b/include/esp32p4/esp_afe_sr_iface.h index 0b52ea4..84d7000 100644 --- a/include/esp32p4/esp_afe_sr_iface.h +++ b/include/esp32p4/esp_afe_sr_iface.h @@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t { int16_t *data; // the data of audio. int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. 
wakenet_state_t wakeup_state; // the value is wakenet_state_t @@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. afe_vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. It's unit is the number of samples. + int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function void* reserved; // reserved for future use } afe_fetch_result_t; diff --git a/include/esp32p4/esp_afe_sr_models.h b/include/esp32p4/esp_afe_sr_models.h index feaad43..39de63f 100644 --- a/include/esp32p4/esp_afe_sr_models.h +++ b/include/esp32p4/esp_afe_sr_models.h @@ -4,7 +4,6 @@ extern "C" { #endif -#if defined CONFIG_USE_AFE #include "esp_afe_sr_iface.h" @@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1; #endif -#else - - -#include "esp_afe_sr_iface.h" -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#endif - #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32p4/esp_vad.h b/include/esp32p4/esp_vad.h index 2440d39..90f8e20 100644 --- a/include/esp32p4/esp_vad.h +++ b/include/esp32p4/esp_vad.h @@ -25,22 +25,65 @@ extern "C" { /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more - * restrictive in reporting speech. + * restrictive in reporting speech. So if you want to trigger more speech, please select a lower mode. 
*/ typedef enum { - VAD_MODE_0 = 0, - VAD_MODE_1, - VAD_MODE_2, - VAD_MODE_3, - VAD_MODE_4 + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { VAD_SILENCE = 0, - VAD_SPEECH + VAD_SPEECH = 1, } vad_state_t; -typedef void* vad_handle_t; +typedef struct vad_trigger_tag { + vad_state_t state; + unsigned int min_speech_len; + unsigned int noise_len; + unsigned int min_noise_len; + unsigned int speech_len; +} vad_trigger_t; + +#define vad_MAX_LEN (INT32_MAX - 1) +/** + * @brief Allocate VAD trigger + * + * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * + * @return Trigger pointer + **/ +vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); + +/** + * @brief Free VAD trigger + **/ +void vad_trigger_free(vad_trigger_t *trigger); + +/** + * @brief Reset VAD trigger + **/ +void vad_trigger_reset(vad_trigger_t *trigger); + +/** + * @brief Detect voice activity by trigger + **/ +vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); + + +typedef struct { + vad_trigger_t *trigger; + void *vad_inst; +}vad_handle_with_trigger_t; + +typedef vad_handle_with_trigger_t* vad_handle_t; + +// typedef vad_handle_tag * vad_handle_t; + /** * @brief Creates an instance to the VAD structure. @@ -53,6 +96,18 @@ typedef void* vad_handle_t; */ vad_handle_t vad_create(vad_mode_t vad_mode); +/** + * @brief Creates an instance to the VAD structure. + * + * @param vad_mode Sets the VAD operating mode.
+ * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * @return + * - NULL: Create failed + * - Others: The instance of VAD + */ +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); + /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * diff --git a/include/esp32p4/esp_vadn_iface.h b/include/esp32p4/esp_vadn_iface.h new file mode 100644 index 0000000..1ec8bb9 --- /dev/null +++ b/include/esp32p4/esp_vadn_iface.h @@ -0,0 +1,142 @@ +#pragma once +#include "esp_vad.h" +#include "stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque model data container +typedef struct model_iface_data_t model_iface_data_t; + +// /** +// * @brief The state of vad +// */ +// typedef enum { +// VAD_NOISE = -1, // Noise +// VADNET_STATE_SILENCE = 0, // Silence +// VAD_SPEECH = 1 // Speech +// } vad_state_t; + +/** + * @brief Easy function type to initialize a model instance with a detection mode + * and specified model name + * + * @param model_name The specified model name + * @param mode The voice activity detection mode + * @param channel_num The number of input audio channels + * @param min_speech_ms The minimum duration of speech in ms to trigger vad + * speech + * @param min_noise_ms The minimum duration of noise in ms to trigger vad + * noise + * @returns Handle to the model data + */ +typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)( + const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms); + +/** + * @brief Get the amount of samples that need to be passed to the detect + * function + * + * Every speech recognition model processes a certain number of samples at the + * same time. This function can be used to query that amount. Note that the + * returned amount is in 16-bit samples, not in bytes.
+ * + * @param model The model object to query + * @return The amount of samples to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model); + +/** + * @brief Get the number of audio channels that need to be passed to the detect + * function + * + * The detect function expects its input to contain this many channels of + * audio. Note that the returned value is a channel count, not a number of + * samples or bytes. + * + * @param model The model object to query + * @return The number of channels to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model); + +/** + * @brief Get the sample rate of the samples to feed to the detect function + * + * @param model The model object to query + * @return The sample rate, in hz + */ +typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model); + +/** + * @brief Set the detection threshold to manually adjust the probability + * + * @param model The model object to query + * @param det_threshold The voice activity detection threshold, the range of + * det_threshold is 0.5~0.9999 + * @return 0: setting failed, 1: setting success + */ +typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold); + +/** + * @brief Get the voice activity detection threshold + * + * @param model The model object to query + * @returns the detection threshold + */ +typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model); + +/** + * @brief Feed samples of an audio stream to the vad model and detect whether it + * is voice. + * + * @param model The model object to query + * @param samples An array of 16-bit signed audio samples. The array size used + * can be queried by the get_samp_chunksize function. + * @return The vad state of the fed samples (vad_state_t): VAD_SILENCE or + * VAD_SPEECH.
+ */ +typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); + +/** + * @brief Get the triggered channel index. Channel index starts from zero + * + * @param model The model object to query + * @return The channel index + */ +typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model); + +/** + * @brief Clean all states of model + * + * @param model The model object to query + */ +typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model); + +/** + * @brief Destroy a model object + * + * @param model Model object to destroy + */ +typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model); + +/** + * This structure contains the functions used to do operations on a voice + * activity detection model. + */ +typedef struct { + esp_vadn_iface_op_create_t create; + esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize; + esp_vadn_iface_op_get_channel_num_t get_channel_num; + esp_vadn_iface_op_get_samp_rate_t get_samp_rate; + esp_vadn_iface_op_set_det_threshold_t set_det_threshold; + esp_vadn_iface_op_get_det_threshold_t get_det_threshold; + esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; + esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_clean_t clean; + esp_vadn_iface_op_destroy_t destroy; +} esp_vadn_iface_t; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/esp32p4/esp_vadn_models.h b/include/esp32p4/esp_vadn_models.h new file mode 100644 index 0000000..eadc55f --- /dev/null +++ b/include/esp32p4/esp_vadn_models.h @@ -0,0 +1,22 @@ +#pragma once +#include "esp_vadn_iface.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The prefix of vadnet model name is used to filter all vadnet from available models.
+#define ESP_VADN_PREFIX "vadnet" + +/** + * @brief Get the vadnet handle from model name + * + * @param model_name The name of model + * @returns The handle of vadnet + */ +const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name); + + +#ifdef __cplusplus +} +#endif diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 6cac4c6..5f70735 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -92,6 +92,10 @@ typedef struct { char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet + char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms + int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; @@ -104,7 +108,7 @@ typedef struct { .voice_communication_init = false, \ .voice_communication_agc_init = false, \ .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ + .vad_mode = VAD_MODE_0, \ .wakenet_model_name = NULL, \ .wakenet_model_name_2 = NULL, \ .wakenet_mode = DET_MODE_90, \ @@ -126,6 +130,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32P4 #define AFE_CONFIG_DEFAULT() { \ @@ -136,7 +144,7 @@ typedef struct { .voice_communication_init = false, \ .voice_communication_agc_init = false, \ .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ + .vad_mode = VAD_MODE_0, \ .wakenet_model_name = NULL, \ .wakenet_model_name_2 = NULL, \ .wakenet_mode = DET_MODE_90, \ @@ -158,6
+166,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -168,7 +180,7 @@ typedef struct { .voice_communication_init = false, \ .voice_communication_agc_init = false, \ .voice_communication_agc_gain = 15, \ - .vad_mode = VAD_MODE_3, \ + .vad_mode = VAD_MODE_0, \ .wakenet_model_name = NULL, \ .wakenet_model_name_2 = NULL, \ .wakenet_mode = DET_MODE_2CH_90, \ @@ -190,6 +202,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #endif diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index 0b52ea4..84d7000 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h @@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t { int16_t *data; // the data of audio. int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. wakenet_state_t wakeup_state; // the value is wakenet_state_t @@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. 
afe_vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. It's unit is the number of samples. + int wake_word_length; // the length of wake word. The unit is the number of samples. int ret_value; // the return state of fetch function void* reserved; // reserved for future use } afe_fetch_result_t; diff --git a/include/esp32s3/esp_afe_sr_models.h b/include/esp32s3/esp_afe_sr_models.h index feaad43..39de63f 100644 --- a/include/esp32s3/esp_afe_sr_models.h +++ b/include/esp32s3/esp_afe_sr_models.h @@ -4,7 +4,6 @@ extern "C" { #endif -#if defined CONFIG_USE_AFE #include "esp_afe_sr_iface.h" @@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1; #endif -#else - - -#include "esp_afe_sr_iface.h" -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#endif - #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index 2440d39..90f8e20 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -25,22 +25,65 @@ extern "C" { /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more - * restrictive in reporting speech. + * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. 
*/ typedef enum { - VAD_MODE_0 = 0, - VAD_MODE_1, - VAD_MODE_2, - VAD_MODE_3, - VAD_MODE_4 + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { VAD_SILENCE = 0, - VAD_SPEECH + VAD_SPEECH = 1, } vad_state_t; -typedef void* vad_handle_t; +typedef struct vad_trigger_tag { + vad_state_t state; + unsigned int min_speech_len; + unsigned int noise_len; + unsigned int min_noise_len; + unsigned int speech_len; +} vad_trigger_t; + +#define vad_MAX_LEN (INT32_MAX - 1) +/** + * @brief Allocate VAD trigger + * + * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * + * @return Trigger pointer + **/ +vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); + +/** + * @brief Free VAD trigger + **/ +void vad_trigger_free(vad_trigger_t *trigger); + +/** + * @brief Reset VAD trigger + **/ +void vad_trigger_reset(vad_trigger_t *trigger); + +/** + * @brief Detect voice activity by trigger + **/ +vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); + + +typedef struct { + vad_trigger_t *trigger; + void *vad_inst; +}vad_handle_with_trigger_t; + +typedef vad_handle_with_trigger_t* vad_handle_t; + +// typedef vad_handle_tag * vad_handle_t; + /** * @brief Creates an instance to the VAD structure. @@ -53,6 +96,18 @@ typedef void* vad_handle_t; */ vad_handle_t vad_create(vad_mode_t vad_mode); +/** + * @brief Creates an instance to the VAD structure. + * + * @param vad_mode Sets the VAD operating mode.
+ * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * @return + * - NULL: Create failed + * - Others: The instance of VAD + */ +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); + /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * diff --git a/include/esp32s3/esp_vadn_iface.h b/include/esp32s3/esp_vadn_iface.h new file mode 100644 index 0000000..1ec8bb9 --- /dev/null +++ b/include/esp32s3/esp_vadn_iface.h @@ -0,0 +1,142 @@ +#pragma once +#include "esp_vad.h" +#include "stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque model data container +typedef struct model_iface_data_t model_iface_data_t; + +// /** +// * @brief The state of vad +// */ +// typedef enum { +// VAD_NOISE = -1, // Noise +// VADNET_STATE_SILENCE = 0, // Silence +// VAD_SPEECH = 1 // Speech +// } vad_state_t; + +/** + * @brief Easy function type to initialize a model instance with a detection mode + * and specified model name + * + * @param model_name The specified model name + * @param mode The voice activity detection mode + * @param channel_num The number of input audio channels + * @param min_speech_ms The minimum duration of speech in ms to trigger vad + * speech + * @param min_noise_ms The minimum duration of noise in ms to trigger vad + * noise + * @returns Handle to the model data + */ +typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)( + const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms); + +/** + * @brief Get the amount of samples that need to be passed to the detect + * function + * + * Every speech recognition model processes a certain number of samples at the + * same time. This function can be used to query that amount. Note that the + * returned amount is in 16-bit samples, not in bytes.
+ * + * @param model The model object to query + * @return The amount of samples to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model); + +/** + * @brief Get the number of audio channels that need to be passed to the detect + * function + * + * The detect function expects its input to contain this many channels of + * audio. Note that the returned value is a channel count, not a number of + * samples or bytes. + * + * @param model The model object to query + * @return The number of channels to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model); + +/** + * @brief Get the sample rate of the samples to feed to the detect function + * + * @param model The model object to query + * @return The sample rate, in hz + */ +typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model); + +/** + * @brief Set the detection threshold to manually adjust the probability + * + * @param model The model object to query + * @param det_threshold The voice activity detection threshold, the range of + * det_threshold is 0.5~0.9999 + * @return 0: setting failed, 1: setting success + */ +typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold); + +/** + * @brief Get the voice activity detection threshold + * + * @param model The model object to query + * @returns the detection threshold + */ +typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model); + +/** + * @brief Feed samples of an audio stream to the vad model and detect whether it + * is voice. + * + * @param model The model object to query + * @param samples An array of 16-bit signed audio samples. The array size used + * can be queried by the get_samp_chunksize function. + * @return The vad state of the fed samples (vad_state_t): VAD_SILENCE or + * VAD_SPEECH.
+ */ +typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); + +/** + * @brief Get the triggered channel index. Channel index starts from zero + * + * @param model The model object to query + * @return The channel index + */ +typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model); + +/** + * @brief Clean all states of model + * + * @param model The model object to query + */ +typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model); + +/** + * @brief Destroy a model object + * + * @param model Model object to destroy + */ +typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model); + +/** + * This structure contains the functions used to do operations on a voice + * activity detection model. + */ +typedef struct { + esp_vadn_iface_op_create_t create; + esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize; + esp_vadn_iface_op_get_channel_num_t get_channel_num; + esp_vadn_iface_op_get_samp_rate_t get_samp_rate; + esp_vadn_iface_op_set_det_threshold_t set_det_threshold; + esp_vadn_iface_op_get_det_threshold_t get_det_threshold; + esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; + esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_clean_t clean; + esp_vadn_iface_op_destroy_t destroy; +} esp_vadn_iface_t; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/esp32s3/esp_vadn_models.h b/include/esp32s3/esp_vadn_models.h new file mode 100644 index 0000000..eadc55f --- /dev/null +++ b/include/esp32s3/esp_vadn_models.h @@ -0,0 +1,22 @@ +#pragma once +#include "esp_vadn_iface.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The prefix of vadnet model name is used to filter all vadnet from available models.
+#define ESP_VADN_PREFIX "vadnet" + +/** + * @brief Get the vadnet handle from model name + * + * @param model_name The name of model + * @returns The handle of vadnet + */ +const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name); + + +#ifdef __cplusplus +} +#endif diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index 4cfc32c..a105141 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libdl_lib.a b/lib/esp32/libdl_lib.a index 173deb4..97717e0 100644 Binary files a/lib/esp32/libdl_lib.a and b/lib/esp32/libdl_lib.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index cd719c4..41d45b4 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index 71d2ef1..8cdf8cf 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index 526f735..024f5c9 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index 81960a7..cee6bd3 100644 Binary files a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32/libwakeword_model.a b/lib/esp32/libwakeword_model.a index 44714b8..b17e140 100644 Binary files a/lib/esp32/libwakeword_model.a and b/lib/esp32/libwakeword_model.a differ diff --git a/lib/esp32p4/libdl_lib.a b/lib/esp32p4/libdl_lib.a index f6c1fda..664b727 100644 Binary files a/lib/esp32p4/libdl_lib.a and b/lib/esp32p4/libdl_lib.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index 6ea5129..0bcdd96 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff
--git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index 730fb62..a4b6de2 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index e31499d..1c73d70 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a new file mode 100644 index 0000000..b654035 Binary files /dev/null and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index a10f40a..4080869 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 1cd372e..108af2e 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index 21626fa..29525a6 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index ed917cb..7c1a1cc 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 8e8db4e..a444b22 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index 76538e2..6a99a57 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index 086a928..a2dd373 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index b790f14..c0465b1 100644 Binary 
files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index b7418f8..319a43c 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 3b00050..7cca9b0 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a new file mode 100644 index 0000000..e07fec7 Binary files /dev/null and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 82c5c27..16d6ec9 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/movemodel.py b/model/movemodel.py index b49aa8a..e3bb4e0 100644 --- a/model/movemodel.py +++ b/model/movemodel.py @@ -28,6 +28,8 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path): for label in f: label = label.strip("\n") if 'CONFIG_SR_WN' in label and '#' not in label[0]: + if '_NONE' in label: + continue if '=' in label: label = label.split("=")[0] if '_MULTI' in label: @@ -113,13 +115,13 @@ def copy_vadnet_from_sdkconfig(model_path, sdkconfig_path, target_path): models_string = '' for label in f: label = label.strip("\n") - if 'CONFIG_SR_VADNET' in label and label[0] != '#': + if 'CONFIG_SR_VADN' in label and label[0] != '#': models_string += label models = [] - if "CONFIG_SR_VADNET_MODLE_SMALL" in models_string: + if "CONFIG_SR_VADN_VADNET1_SMALL" in models_string: models.append('vadnet1_small') - elif "CONFIG_SR_VADNET_MODLE_MEDIUM" in models_string: + elif "CONFIG_SR_VADN_VADNET1_MEDIUM" in models_string: models.append('vadnet1_medium') for item in models: diff --git a/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ new file mode 100644 index 0000000..5ba7d5f --- /dev/null +++ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ @@ 
-0,0 +1 @@ +vadnet1_mediumv1_Speech_3_0.5_0.1 \ No newline at end of file diff --git a/model/vadnet_model/vadnet1_medium/vadn1_data b/model/vadnet_model/vadnet1_medium/vadn1_data new file mode 100644 index 0000000..55c694e Binary files /dev/null and b/model/vadnet_model/vadnet1_medium/vadn1_data differ diff --git a/model/vadnet_model/vadnet1_medium/vadn1_index b/model/vadnet_model/vadnet1_medium/vadn1_index new file mode 100644 index 0000000..9ce8fa7 Binary files /dev/null and b/model/vadnet_model/vadnet1_medium/vadn1_index differ diff --git a/model/wakenet_model/wn9_nihaodameng.zip b/model/wakenet_model/wn9_nihaodameng.zip new file mode 100644 index 0000000..e33779c Binary files /dev/null and b/model/wakenet_model/wn9_nihaodameng.zip differ diff --git a/model/wakenet_model/wn9_nihaodameng/_MODEL_INFO_ b/model/wakenet_model/wn9_nihaodameng/_MODEL_INFO_ new file mode 100644 index 0000000..6d28a56 --- /dev/null +++ b/model/wakenet_model/wn9_nihaodameng/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts2h12_你好达蒙_3_0.634_0.640 diff --git a/model/wakenet_model/wn9_nihaodameng/wn9_data b/model/wakenet_model/wn9_nihaodameng/wn9_data new file mode 100644 index 0000000..1b13ec6 Binary files /dev/null and b/model/wakenet_model/wn9_nihaodameng/wn9_data differ diff --git a/model/wakenet_model/wn9_nihaodameng/wn9_index b/model/wakenet_model/wn9_nihaodameng/wn9_index new file mode 100644 index 0000000..5e7c881 Binary files /dev/null and b/model/wakenet_model/wn9_nihaodameng/wn9_index differ diff --git a/src/esp_process_sdkconfig.c b/src/esp_process_sdkconfig.c index 1fe6d09..626e195 100644 --- a/src/esp_process_sdkconfig.c +++ b/src/esp_process_sdkconfig.c @@ -57,8 +57,16 @@ void check_chip_config(void) ESP_LOGW(TAG, "PSRAM freq should be 200MHz"); #endif +#ifdef CONFIG_ESP32P4_DATA_CACHE_128KB + ESP_LOGW(TAG, "Recommend data cache larger than 128KB"); +#endif + +#ifdef CONFIG_ESP32P4_DATA_CACHE_LINE_64B + ESP_LOGW(TAG, "Recommend data cache line larger than 64B"); +#endif + #else - 
ESP_LOGW(TAG, "ESP-SR-AFE only support ESP32/ESP32S3"); + ESP_LOGW(TAG, "ESP-SR-AFE only support ESP32/ESP32S3/ESP32P4"); #endif } @@ -476,7 +484,7 @@ char *get_id_name_cn(int i) char *get_id_name_en(int i) { -#if defined CONFIG_USE_MULTINET && defined CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 +#if CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 if (i == 0) { return CONFIG_EN_SPEECH_COMMAND_ID0; } else if (i == 1) { diff --git a/test_apps/esp-sr/main/test_afe.cpp b/test_apps/esp-sr/main/test_afe.cpp index 35fa630..975d5fd 100644 --- a/test_apps/esp-sr/main/test_afe.cpp +++ b/test_apps/esp-sr/main/test_afe.cpp @@ -23,6 +23,8 @@ #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) #include "esp_nsn_models.h" #include "esp_nsn_iface.h" +#include "esp_vadn_models.h" +#include "esp_vadn_iface.h" #endif #define ARRAY_SIZE_OFFSET 8 // Increase this if audio_sys_get_real_time_stats returns ESP_ERR_INVALID_SIZE @@ -69,6 +71,10 @@ TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<< int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); srmodel_list_t *models = esp_srmodel_init("model"); char *model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); + char *vad_model_name = NULL; +#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) + vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); +#endif esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_SR_HANDLE; afe_config_t afe_config = AFE_CONFIG_DEFAULT(); @@ -79,7 +85,10 @@ TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<< afe_config.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM; afe_config.wakenet_model_name = model_name; afe_config.voice_communication_init = false; - + afe_config.vad_model_name = vad_model_name; + if (vad_model_name) { + printf("vad_model_name:%s\n", vad_model_name); + } // test model loading time struct timeval tv_start, tv_end; @@ -106,7 +115,11 @@ TEST_CASE(">>>>>>>> 
audio_front_end SR create/destroy API & memory leak <<<<<<<< printf("init partition ...\n"); models = esp_srmodel_init("model"); model_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); +#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) + vad_model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); +#endif afe_config.wakenet_model_name = model_name; + afe_config.vad_model_name = vad_model_name; printf("create ...\n"); afe_data = afe_handle->create_from_config(&afe_config); diff --git a/test_apps/esp-sr/main/test_multinet.cpp b/test_apps/esp-sr/main/test_multinet.cpp index 49778f0..9db0a1c 100644 --- a/test_apps/esp-sr/main/test_multinet.cpp +++ b/test_apps/esp-sr/main/test_multinet.cpp @@ -93,6 +93,7 @@ TEST_CASE("multinet cpu loading", "[mn]") struct timeval tv_start, tv_end; gettimeofday(&tv_start, NULL); esp_mn_state_t mn_state; + multinet->print_active_speech_commands(model_data); while (1) { if ((chunks + 1)*audio_chunksize <= data_size) { diff --git a/test_apps/esp-sr/pytest_esp_sr.py b/test_apps/esp-sr/pytest_esp_sr.py index 53de838..cfd3e41 100644 --- a/test_apps/esp-sr/pytest_esp_sr.py +++ b/test_apps/esp-sr/pytest_esp_sr.py @@ -15,10 +15,7 @@ from pytest_embedded import Dut ], ) def test_multinet_s3(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="mn") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[mn]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="mn") @pytest.mark.target('esp32p4') @pytest.mark.env('esp32p4') @@ -30,10 +27,7 @@ def test_multinet_s3(dut: Dut)-> None: ], ) def test_multinet_p4(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="mn") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[mn]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="mn") @pytest.mark.target('esp32s3') @@ -46,10 +40,7 @@ def test_multinet_p4(dut: Dut)-> None: ], ) def test_wakenet(dut: Dut)-> None: - 
# dut.run_all_single_board_cases(group="wn") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[wn]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="wn") @pytest.mark.target('esp32p4') @pytest.mark.env('esp32p4') @@ -61,10 +52,7 @@ def test_wakenet(dut: Dut)-> None: ], ) def test_wakenet_p4(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="wn") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[wn]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="wn") @pytest.mark.target('esp32s3') @pytest.mark.env('esp32s3') @@ -72,13 +60,11 @@ def test_wakenet_p4(dut: Dut)-> None: 'config', [ 'wn9_hilexin', + 'vadnet', ], ) def test_sr_afe(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="afe") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[afe_sr]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="afe_sr", timeout=100000) @pytest.mark.target('esp32p4') @pytest.mark.env('esp32p4') @@ -89,10 +75,7 @@ def test_sr_afe(dut: Dut)-> None: ], ) def test_sr_afe_p4(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="afe") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[afe_sr]') - dut.expect_unity_test_output(timeout = 1000) + dut.run_all_single_board_cases(group="afe_sr", timeout=100000) @pytest.mark.target('esp32s3') @@ -104,10 +87,7 @@ def test_sr_afe_p4(dut: Dut)-> None: ], ) def test_vc_afe(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="afe") - dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[afe_vc]') - dut.expect_unity_test_output(timeout = 100000) + dut.run_all_single_board_cases(group="afe_vc", timeout=100000) @pytest.mark.target('esp32p4') @@ -119,7 +99,4 @@ def test_vc_afe(dut: Dut)-> None: ], ) def test_vc_afe_p4(dut: Dut)-> None: - # dut.run_all_single_board_cases(group="afe") - 
dut.expect_exact('Press ENTER to see the list of tests.') - dut.write('[afe_vc]') - dut.expect_unity_test_output(timeout = 100000) \ No newline at end of file + dut.run_all_single_board_cases(group="afe_vc", timeout=100000) \ No newline at end of file diff --git a/test_apps/esp-sr/sdkconfig.ci.mn5q8_cn b/test_apps/esp-sr/sdkconfig.ci.mn5q8_cn index b83bd36..ac866c2 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn5q8_cn +++ b/test_apps/esp-sr/sdkconfig.ci.mn5q8_cn @@ -6,6 +6,7 @@ CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_SR_MN_CN_MULTINET5_RECOGNITION_QUANT8=y CONFIG_SPIRAM=y CONFIG_SPIRAM_MODE_OCT=y diff --git a/test_apps/esp-sr/sdkconfig.ci.mn5q8_en b/test_apps/esp-sr/sdkconfig.ci.mn5q8_en index fe47163..37eb87f 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn5q8_en +++ b/test_apps/esp-sr/sdkconfig.ci.mn5q8_en @@ -1,5 +1,5 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 @@ -7,7 +7,6 @@ CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y CONFIG_SR_WN_WN9_HIESP=y -CONFIG_SR_MN_CN_NONE=y CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8=y CONFIG_ESP_PHY_REDUCE_TX_POWER=y CONFIG_SPIRAM=y diff --git a/test_apps/esp-sr/sdkconfig.ci.mn6_cn b/test_apps/esp-sr/sdkconfig.ci.mn6_cn index 1b40076..d8705c2 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn6_cn +++ b/test_apps/esp-sr/sdkconfig.ci.mn6_cn @@ -1,11 +1,13 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_WN_WN9_HILEXIN=y +CONFIG_SR_MN_CN_MULTINET6_QUANT=y CONFIG_SPIRAM=y CONFIG_SPIRAM_MODE_OCT=y CONFIG_SPIRAM_SPEED_80M=y diff --git a/test_apps/esp-sr/sdkconfig.ci.mn6_en b/test_apps/esp-sr/sdkconfig.ci.mn6_en index 0e8fbe2..bef8835 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn6_en +++ b/test_apps/esp-sr/sdkconfig.ci.mn6_en @@ -1,12 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_SR_MN_CN_NONE=y +CONFIG_SR_WN_WN9_HIESP=y CONFIG_SR_MN_EN_MULTINET6_QUANT=y CONFIG_ESP_PHY_REDUCE_TX_POWER=y CONFIG_SPIRAM=y diff --git a/test_apps/esp-sr/sdkconfig.ci.mn7_en b/test_apps/esp-sr/sdkconfig.ci.mn7_en index 3231fe7..fe88911 100644 --- a/test_apps/esp-sr/sdkconfig.ci.mn7_en +++ b/test_apps/esp-sr/sdkconfig.ci.mn7_en @@ -1,12 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_SR_MN_CN_NONE=y +CONFIG_SR_WN_WN9_HIESP=y CONFIG_SR_MN_EN_MULTINET7_QUANT=y CONFIG_SPIRAM=y CONFIG_SPIRAM_MODE_OCT=y diff --git a/test_apps/esp-sr/sdkconfig.ci.nsnet2 b/test_apps/esp-sr/sdkconfig.ci.nsnet2 index bfce388..e421ae1 100644 --- a/test_apps/esp-sr/sdkconfig.ci.nsnet2 +++ b/test_apps/esp-sr/sdkconfig.ci.nsnet2 @@ -1,13 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_USE_NSNET=y -CONFIG_USE_MULTINET=n +CONFIG_SR_NSN_NSNET2=y CONFIG_SPIRAM=y CONFIG_SPIRAM_MODE_OCT=y CONFIG_SPIRAM_SPEED_80M=y diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn index 32a2bbf..36f99e6 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn +++ b/test_apps/esp-sr/sdkconfig.ci.p4_mn7_cn @@ -5,6 +5,7 @@ CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_SR_MN_CN_MULTINET7_QUANT=y CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_ESP32P4_REV_MIN_0=y diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 b/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 index f7c4e63..f9e7d8d 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 +++ b/test_apps/esp-sr/sdkconfig.ci.p4_nsnet2 @@ -5,9 +5,8 @@ CONFIG_IDF_TARGET="esp32p4" 
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_USE_NSNET=y CONFIG_SR_WN_WN9_HIESP=y -CONFIG_USE_MULTINET=n +CONFIG_SR_NSN_NSNET2=y CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_ESP32P4_REV_MIN_0=y CONFIG_SPIRAM=y diff --git a/test_apps/esp-sr/sdkconfig.ci.vadnet b/test_apps/esp-sr/sdkconfig.ci.vadnet new file mode 100644 index 0000000..235a812 --- /dev/null +++ b/test_apps/esp-sr/sdkconfig.ci.vadnet @@ -0,0 +1,24 @@ +# This file was generated using idf.py save-defconfig. It can be edited manually. +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration +# +CONFIG_IDF_TARGET="esp32s3" +CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 +CONFIG_ESPTOOLPY_FLASHMODE_QIO=y +CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y +CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y +CONFIG_SR_WN_WN9_HILEXIN=y +CONFIG_SPIRAM=y +CONFIG_SPIRAM_MODE_OCT=y +CONFIG_SPIRAM_SPEED_80M=y +CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y +CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y +CONFIG_ESP32S3_DATA_CACHE_64KB=y +CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y +CONFIG_ESP_MAIN_TASK_STACK_SIZE=8192 +CONFIG_ESP_WIFI_GMAC_SUPPORT=n +CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=y +CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y +CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744 +CONFIG_LWIP_TCP_WND_DEFAULT=5744 +CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024 diff --git a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin index 1ea1ba0..e002d55 100644 --- a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin +++ b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin @@ -1,13 +1,12 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_USE_NSNET=y -CONFIG_USE_MULTINET=n +CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_ESP_PHY_REDUCE_TX_POWER=y CONFIG_SPIRAM=y CONFIG_SPIRAM_MODE_OCT=y diff --git a/test_apps/esp-tts/sdkconfig.ci.p4 b/test_apps/esp-tts/sdkconfig.ci.p4 index 7fdc121..b35b0b5 100644 --- a/test_apps/esp-tts/sdkconfig.ci.p4 +++ b/test_apps/esp-tts/sdkconfig.ci.p4 @@ -1,13 +1,10 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. -# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y -CONFIG_USE_AFE=n -CONFIG_USE_WAKENET=n -CONFIG_USE_MULTINET=n CONFIG_COMPILER_OPTIMIZATION_PERF=y CONFIG_ESP32P4_REV_MIN_0=y CONFIG_SPIRAM=y diff --git a/test_apps/esp-tts/sdkconfig.ci.s3 b/test_apps/esp-tts/sdkconfig.ci.s3 index 1b40076..67f900f 100644 --- a/test_apps/esp-tts/sdkconfig.ci.s3 +++ b/test_apps/esp-tts/sdkconfig.ci.s3 @@ -1,5 +1,5 @@ # This file was generated using idf.py save-defconfig. It can be edited manually. 
-# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration +# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration # CONFIG_IDF_TARGET="esp32s3" CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16 @@ -13,6 +13,9 @@ CONFIG_ESP_DEFAULT_CPU_FREQ_MHZ_240=y CONFIG_ESP32S3_INSTRUCTION_CACHE_32KB=y CONFIG_ESP32S3_DATA_CACHE_64KB=y CONFIG_ESP32S3_DATA_CACHE_LINE_64B=y +CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n +CONFIG_ESP_INT_WDT=n +CONFIG_ESP_TASK_WDT_EN=n CONFIG_ESP_WIFI_GMAC_SUPPORT=n CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744 CONFIG_LWIP_TCP_WND_DEFAULT=5744