diff --git a/CHANGELOG.md b/CHANGELOG.md index 7497327..a542118 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,11 @@ - Available storage is less than the remaining flash space on IDF v5.0. If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later. +## 1.8.0 +- Support esp-idf v5.3 +- Add more new wake words +- Add setting "fixed_first_channel" in afe_config + ## 1.7.1 - Add 喵喵同学,Hi,joy, (Hi,Lily/Hi,莉莉) wakenet model diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 6b1bc35..ddc3ba6 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -162,6 +162,10 @@ choice SR_WN_MODEL_LOAD bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)" depends on IDF_TARGET_ESP32S3 + config SR_WN_WN9_HITELLY_TTS + bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)" + depends on IDF_TARGET_ESP32S3 + config SR_WN_WN9_HEYWANDA_TTS bool "Hey,Wanda (wn9_heywanda_tts)" depends on IDF_TARGET_ESP32S3 @@ -269,6 +273,10 @@ menu "Load Multiple Wake Words" bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)" depends on IDF_TARGET_ESP32S3 + config SR_WN_WN9_HITELLY_TTS_MULTI + bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)" + depends on IDF_TARGET_ESP32S3 + config SR_WN_WN9_XIAOBINXIAOBIN_TTS_MULTI bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)" depends on IDF_TARGET_ESP32S3 diff --git a/README.md b/README.md index 78b9fd4..2dedb0b 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ The following wake words are supported in esp-sr: |喵喵同学 | | wn9_miaomiaotongxue_tts| |Hi,喵喵 | | wn9_himiaomiao_tts | |Hi,Lily/Hi,莉莉 | | wn9_hilili_tts | +|Hi,Telly/Hi,泰力 | | wn9_hitelly_tts | |小滨小滨/小冰小冰| | wn9_xiaobinxiaobin_tts | *NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples. 
diff --git a/idf_component.yml b/idf_component.yml index ec51140..0d47715 100644 --- a/idf_component.yml +++ b/idf_component.yml @@ -1,4 +1,4 @@ -version: "1.7.1" +version: "1.8.0" description: esp_sr provides basic algorithms for Speech Recognition applications url: https://github.com/espressif/esp-sr dependencies: diff --git a/include/esp32s3/dl_lib_matrix.h b/include/esp32s3/dl_lib_matrix.h index b5fae74..59f7d79 100644 --- a/include/esp32s3/dl_lib_matrix.h +++ b/include/esp32s3/dl_lib_matrix.h @@ -25,10 +25,6 @@ extern "C" { #endif -// #ifdef CONFIG_IDF_TARGET_ESP32S3 -// #include "dl_tie728_bzero.h" -// #endif - typedef float fptp_t; #if CONFIG_BT_SHARE_MEM_REUSE diff --git a/include/esp32s3/esp_aec.h b/include/esp32s3/esp_aec.h index 03afc90..deb031c 100644 --- a/include/esp32s3/esp_aec.h +++ b/include/esp32s3/esp_aec.h @@ -23,7 +23,8 @@ extern "C" { #define USE_AEC_FFT // Not kiss_fft #define AEC_USE_SPIRAM 0 #define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz -#define AEC_FRAME_LENGTH_MS 16 +//#define AEC_FRAME_LENGTH_MS 16 +#define AEC_FRAME_LENGTH_MS 32 #define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel typedef void* aec_handle_t; diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index e4c681e..8b16772 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -90,6 +90,8 @@ typedef struct { afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; afe_ns_mode_t afe_ns_mode; char *afe_ns_model_name; + bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone + // otherwise, select channel number by wakenet } afe_config_t; @@ -121,6 +123,37 @@ typedef struct { .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ +} +#elif CONFIG_IDF_TARGET_ESP32P4 +#define AFE_CONFIG_DEFAULT() { \ + .aec_init = true, \ + 
.se_init = true, \ + .vad_init = true, \ + .wakenet_init = true, \ + .voice_communication_init = false, \ + .voice_communication_agc_init = false, \ + .voice_communication_agc_gain = 15, \ + .vad_mode = VAD_MODE_3, \ + .wakenet_model_name = NULL, \ + .wakenet_model_name_2 = NULL, \ + .wakenet_mode = DET_MODE_90, \ + .afe_mode = SR_MODE_HIGH_PERF, \ + .afe_perferred_core = 0, \ + .afe_perferred_priority = 5, \ + .afe_ringbuf_size = 50, \ + .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \ + .afe_linear_gain = 1.0, \ + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ + .pcm_config.total_ch_num = 3, \ + .pcm_config.mic_num = 2, \ + .pcm_config.ref_num = 1, \ + .pcm_config.sample_rate = 16000, \ + .debug_init = false, \ + .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ + .afe_ns_mode = NS_MODE_SSP, \ + .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -150,6 +183,7 @@ typedef struct { .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \ .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ + .fixed_first_channel = true, \ } #endif diff --git a/include/esp32s3/esp_nsn_models.h b/include/esp32s3/esp_nsn_models.h index 7b3bf49..8165e27 100644 --- a/include/esp32s3/esp_nsn_models.h +++ b/include/esp32s3/esp_nsn_models.h @@ -14,4 +14,4 @@ Now there are nsnet1 and nsnet2 * @param model_name The name of model * @returns The handle of multinet */ -esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name); \ No newline at end of file +esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name); diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index f09717c..d365aa6 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index 8315980..d217cb4 100644 
Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 0d7db59..7a70f8a 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 8fd8b01..baeb11e 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index 99d7920..f25d1cf 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index 1e768bb..7214499 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index dec83b0..fab63d7 100644 Binary files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index e07967b..f23ce94 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 402023f..859197e 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 625de12..890b848 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/wakenet_model/wn9_hitelly_tts/_MODEL_INFO_ b/model/wakenet_model/wn9_hitelly_tts/_MODEL_INFO_ new file mode 100644 index 0000000..dc0dff4 --- /dev/null +++ b/model/wakenet_model/wn9_hitelly_tts/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts1h8_Hi,Telly or Hi,泰力_3_0.613_0.619 diff --git a/model/wakenet_model/wn9_hitelly_tts/wn9_data b/model/wakenet_model/wn9_hitelly_tts/wn9_data new file mode 100644 index 0000000..44e070c Binary files 
/dev/null and b/model/wakenet_model/wn9_hitelly_tts/wn9_data differ
diff --git a/model/wakenet_model/wn9_hitelly_tts/wn9_index b/model/wakenet_model/wn9_hitelly_tts/wn9_index
new file mode 100644
index 0000000..5e7c881
Binary files /dev/null and b/model/wakenet_model/wn9_hitelly_tts/wn9_index differ
diff --git a/src/esp_mn_speech_commands.c b/src/esp_mn_speech_commands.c
index ddb0b55..344b013 100644
--- a/src/esp_mn_speech_commands.c
+++ b/src/esp_mn_speech_commands.c
@@ -156,6 +156,94 @@ esp_err_t esp_mn_commands_add(int command_id, char *string)
     return ESP_OK;
 }
 
+/**
+ * @brief Add one speech command with an explicit phoneme sequence
+ *
+ * The command is validated by the model's check_speech_command(): against
+ * `phonemes` when CONFIG_SR_MN_EN_MULTINET7_QUANT is defined, against `string`
+ * otherwise. If `string` is already registered, only its command ID is
+ * updated. Otherwise a new phrase carrying its own copy of `phonemes` is
+ * appended to the tail of the command list.
+ *
+ * @param command_id The command ID
+ * @param string     The command string, must not be NULL
+ * @param phonemes   The phonemes of the command, must not be NULL
+ *
+ * @return
+ *     - ESP_OK                 Command added, or already registered
+ *     - ESP_ERR_INVALID_STATE  Model not created, NULL argument, too many
+ *                              commands, invalid command or out of memory
+ */
+esp_err_t esp_mn_commands_phoneme_add(int command_id, char *string, char *phonemes)
+{
+    if (NULL == esp_mn_root || esp_mn_model_handle == NULL || esp_mn_model_data == NULL) {
+        ESP_LOGE(TAG, "Please create mn model first.\n");
+        return ESP_ERR_INVALID_STATE;
+    }
+    // strlen() and check_speech_command() below must never be handed NULL
+    ESP_RETURN_ON_FALSE(string != NULL && phonemes != NULL, ESP_ERR_INVALID_STATE, TAG, "command string and phonemes must not be NULL");
+    int last_node_elem_num = esp_mn_commands_num();
+    ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands exceed ESP_MN_MAX_PHRASE_NUM");
+
+#ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, phonemes) == 0) {
+        // error message is printed inside check_speech_command
+        ESP_LOGE(TAG, "invalid command, please check format, %s (%s).\n", string, phonemes);
+        return ESP_ERR_INVALID_STATE;
+    }
+#else
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, string) == 0) {
+        // error message is printed inside check_speech_command
+        ESP_LOGE(TAG, "invalid command, please check format, %s.\n", string);
+        return ESP_ERR_INVALID_STATE;
+    }
+#endif
+
+    esp_mn_node_t *existing = esp_mn_command_search(string);
+    if (existing != NULL) {
+        // command already exists
+        if (command_id != existing->phrase->command_id) {
+            // change command id
+            existing->phrase->command_id = command_id;
+        } else {
+            // it's exactly the same, do nothing
+            ESP_LOGI(TAG, "command %d: (%s) already exists.", command_id, string);
+        }
+        return ESP_OK;
+    }
+
+    esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, string);
+    if (phrase == NULL) {
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    size_t phoneme_len = strlen(phonemes);
+    phrase->phonemes = _esp_mn_calloc_(phoneme_len + 1, sizeof(char));
+    if (phrase->phonemes == NULL) {
+        ESP_LOGE(TAG, "Fail to alloc phonemes of command %d", command_id);
+        esp_mn_phrase_free(phrase);
+        return ESP_ERR_INVALID_STATE;
+    }
+    // copy the terminating '\0' together with the text
+    memcpy(phrase->phonemes, phonemes, phoneme_len + 1);
+
+    esp_mn_node_t *new_node = esp_mn_node_alloc(phrase);
+    if (new_node == NULL) {
+        // NOTE(review): relies on esp_mn_phrase_free() also releasing phrase->phonemes - confirm
+        esp_mn_phrase_free(phrase);
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    // append the new node at the tail of the command list
+    esp_mn_node_t *tail = esp_mn_root;
+    while (tail->next != NULL) {
+        tail = tail->next;
+    }
+    tail->next = new_node;
+
+    return ESP_OK;
+}
+
 esp_err_t esp_mn_commands_modify(char *old_string, char *new_string)
 {
 #ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
diff --git a/src/include/esp_mn_speech_commands.h b/src/include/esp_mn_speech_commands.h
index 335c88f..d643c82 100644
--- a/src/include/esp_mn_speech_commands.h
+++ b/src/include/esp_mn_speech_commands.h
@@ -59,6 +59,20 @@ esp_err_t esp_mn_commands_free(void);
  */
 esp_err_t esp_mn_commands_add(int command_id, char *string);
 
+/**
+ * @brief Add one speech command with command string, command phonemes and command ID
+ *        Please use multinet_g2p.py[esp-sr/tool/multinet_g2p.py] to generate phonemes from command string.
+ * + * @param command_id The command ID + * @param string The command string of the speech commands + * @param phonemes The phonemes of the speech commands + * + * @return + * - ESP_OK Success + * - ESP_ERR_INVALID_STATE Fail + */ +esp_err_t esp_mn_commands_phoneme_add(int command_id, char *string, char *phonemes); + /** * @brief Modify one speech commands with new command string * @@ -178,4 +192,4 @@ void esp_mn_active_commands_print(void); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/src/model_path.c b/src/model_path.c index 9e34b6c..9c1267a 100644 --- a/src/model_path.c +++ b/src/model_path.c @@ -4,7 +4,9 @@ #include "string.h" #include "model_path.h" #include "esp_wn_models.h" +#ifndef CONFIG_IDF_TARGET_ESP32P4 #include "esp_mn_models.h" +#endif #ifdef ESP_PLATFORM #include @@ -244,6 +246,7 @@ void srmodel_spiffs_deinit(srmodel_list_t *models) } +#ifdef CONFIG_IDF_TARGET_ESP32 srmodel_list_t *srmodel_config_init() { if (static_srmodels == NULL) { @@ -305,6 +308,7 @@ void srmodel_config_deinit(srmodel_list_t *models) // models is static_srmodels static_srmodels = NULL; } +#endif model_coeff_getter_t *srmodel_get_model_coeff(char *model_name) {