Merge branch 'feat/support_idf_5_3' into 'master'

Feat/support idf 5 3 See merge request speech-recognition-framework/esp-sr!103
2025-09-15 15:28:44 +08:00 · 2024-08-05 10:58:33 +08:00 · 2024-08-05 10:58:33 +08:00 · 5d718db85a
commit 5d718db85a
parent 2058b4eb3a 42d2a77abd
24 changed files with 130 additions and 8 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -4,6 +4,11 @@
 - Available storage is less than the remaining flash space on IDF v5.0.   
 If you cannot map the model partition successfully, check the remaining free storage with `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.

+## 1.8.0
+- Support esp-idf v5.3
+- Add more new wake words
+- Add setting "fixed_first_channel" in afe_config
+ 
 ## 1.7.1
 - Add 喵喵同学，Hi,joy, (Hi,Lily/Hi,莉莉) wakenet model

--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@ -162,6 +162,10 @@ choice SR_WN_MODEL_LOAD
        bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
        depends on IDF_TARGET_ESP32S3

+    config SR_WN_WN9_HITELLY_TTS
+        bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
+        depends on IDF_TARGET_ESP32S3
+
    config SR_WN_WN9_HEYWANDA_TTS
        bool "Hey,Wanda (wn9_heywanda_tts)"
        depends on IDF_TARGET_ESP32S3
@ -269,6 +273,10 @@ menu "Load Multiple Wake Words"
    bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
    depends on IDF_TARGET_ESP32S3

+    config SR_WN_WN9_HITELLY_TTS_MULTI
+    bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
+    depends on IDF_TARGET_ESP32S3
+
    config SR_WN_WN9_XIAOBINXIAOBIN_TTS_MULTI
    bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)"
    depends on IDF_TARGET_ESP32S3
--- a/README.md
+++ b/README.md
@ -54,6 +54,7 @@ The following wake words are supported in esp-sr:
 |喵喵同学         |                                        | wn9_miaomiaotongxue_tts|
 |Hi,喵喵          |                                        | wn9_himiaomiao_tts     |
 |Hi,Lily/Hi,莉莉  |                                        | wn9_hilili_tts         |
+|Hi,Telly/Hi,泰力 |                                        | wn9_hitelly_tts        |
 |小滨小滨/小冰小冰|                                        | wn9_xiaobinxiaobin_tts |

 *NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples.
--- a/idf_component.yml
+++ b/idf_component.yml
@ -1,4 +1,4 @@
-version: "1.7.1"
+version: "1.8.0"
 description: esp_sr provides basic algorithms for Speech Recognition applications
 url: https://github.com/espressif/esp-sr
 dependencies:
--- a/include/esp32s3/dl_lib_matrix.h
+++ b/include/esp32s3/dl_lib_matrix.h
@ -25,10 +25,6 @@
 extern "C" {
 #endif

-// #ifdef CONFIG_IDF_TARGET_ESP32S3
-// #include "dl_tie728_bzero.h"
-// #endif
-
 typedef float fptp_t;

 #if CONFIG_BT_SHARE_MEM_REUSE
--- a/include/esp32s3/esp_aec.h
+++ b/include/esp32s3/esp_aec.h
@ -23,7 +23,8 @@ extern "C" {
 #define USE_AEC_FFT                      // Not kiss_fft
 #define AEC_USE_SPIRAM      0
 #define AEC_SAMPLE_RATE     16000        // Only Support 16000Hz
-#define AEC_FRAME_LENGTH_MS 16
+//#define AEC_FRAME_LENGTH_MS 16
+#define AEC_FRAME_LENGTH_MS 32
 #define AEC_FILTER_LENGTH   1200         // Number of samples of echo to cancel

 typedef void* aec_handle_t;
--- a/include/esp32s3/esp_afe_config.h
+++ b/include/esp32s3/esp_afe_config.h
@ -90,6 +90,8 @@ typedef struct {
    afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
    afe_ns_mode_t afe_ns_mode;
    char *afe_ns_model_name;
+    bool fixed_first_channel;                // If true, the channel after first wake-up is fixed to raw data of microphone
+                                             // otherwise, select channel number by wakenet
 } afe_config_t;


@ -121,6 +123,37 @@ typedef struct {
    .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
    .afe_ns_mode = NS_MODE_SSP, \
    .afe_ns_model_name = NULL, \
+    .fixed_first_channel = true, \
+}
+#elif CONFIG_IDF_TARGET_ESP32P4
+#define AFE_CONFIG_DEFAULT() { /* default afe_config_t initializer selected when CONFIG_IDF_TARGET_ESP32P4 is set */ \
+    .aec_init = true, \
+    .se_init = true, \
+    .vad_init = true, \
+    .wakenet_init = true, \
+    .voice_communication_init = false, \
+    .voice_communication_agc_init = false, \
+    .voice_communication_agc_gain = 15, \
+    .vad_mode = VAD_MODE_3, \
+    .wakenet_model_name = NULL, \
+    .wakenet_model_name_2 = NULL, \
+    .wakenet_mode = DET_MODE_90, \
+    .afe_mode = SR_MODE_HIGH_PERF, \
+    .afe_perferred_core = 0, \
+    .afe_perferred_priority = 5, \
+    .afe_ringbuf_size = 50, \
+    .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
+    .afe_linear_gain = 1.0, \
+    .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
+    .pcm_config.total_ch_num = 3, /* total = mic_num (2) + ref_num (1) below */ \
+    .pcm_config.mic_num = 2, \
+    .pcm_config.ref_num = 1, \
+    .pcm_config.sample_rate = 16000, \
+    .debug_init = false, \
+    .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
+    .afe_ns_mode = NS_MODE_SSP, \
+    .afe_ns_model_name = NULL, /* no noise-suppression model name set by default */ \
+    .fixed_first_channel = true, /* after the first wake-up the channel stays fixed to the raw microphone data */ \
 }
 #elif CONFIG_IDF_TARGET_ESP32S3
 #define AFE_CONFIG_DEFAULT() { \
@ -150,6 +183,7 @@ typedef struct {
    .debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
    .afe_ns_mode = NS_MODE_SSP, \
    .afe_ns_model_name = NULL, \
+    .fixed_first_channel = true, \
 }
 #endif

--- a/include/esp32s3/esp_nsn_models.h
+++ b/include/esp32s3/esp_nsn_models.h
@ -14,4 +14,4 @@ Now there are nsnet1 and nsnet2
 * @param model_name   The name of model 
 * @returns The handle of nsnet
 */
-esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
+esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
--- a/lib/esp32s3/libc_speech_features.a
+++ b/lib/esp32s3/libc_speech_features.a
--- a/lib/esp32s3/libdl_lib.a
+++ b/lib/esp32s3/libdl_lib.a
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libesp_audio_processor.a
+++ b/lib/esp32s3/libesp_audio_processor.a
--- a/lib/esp32s3/libflite_g2p.a
+++ b/lib/esp32s3/libflite_g2p.a
--- a/lib/esp32s3/libfst.a
+++ b/lib/esp32s3/libfst.a
--- a/lib/esp32s3/libhufzip.a
+++ b/lib/esp32s3/libhufzip.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libnsnet.a
+++ b/lib/esp32s3/libnsnet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/model/wakenet_model/wn9_hitelly_tts/_MODEL_INFO_
+++ b/model/wakenet_model/wn9_hitelly_tts/_MODEL_INFO_
@ -0,0 +1 @@
+wakenet9l_tts1h8_Hi,Telly or Hi,泰力_3_0.613_0.619
--- a/model/wakenet_model/wn9_hitelly_tts/wn9_data
+++ b/model/wakenet_model/wn9_hitelly_tts/wn9_data
--- a/model/wakenet_model/wn9_hitelly_tts/wn9_index
+++ b/model/wakenet_model/wn9_hitelly_tts/wn9_index
--- a/src/esp_mn_speech_commands.c
+++ b/src/esp_mn_speech_commands.c
@ -156,6 +156,64 @@ esp_err_t esp_mn_commands_add(int command_id, char *string)
    return ESP_OK;
 }

+/**
+ * Add one speech command given its command string, its phonemes and its command ID.
+ *
+ * The command is validated with the model's check_speech_command() (against the
+ * phonemes when CONFIG_SR_MN_EN_MULTINET7_QUANT is set, against the string otherwise).
+ * If a command with the same string is already registered, only its command ID is
+ * updated. Otherwise a new phrase holding a private copy of the phonemes is appended
+ * to the end of the command list.
+ *
+ * @param command_id  The command ID
+ * @param string      The command string (must not be NULL)
+ * @param phonemes    The phonemes of the command (must not be NULL); copied internally
+ *
+ * @return ESP_OK on success, ESP_ERR_INVALID_STATE on any failure (model not created,
+ *         NULL argument, too many commands, invalid command, or allocation failure).
+ */
+esp_err_t esp_mn_commands_phoneme_add(int command_id, char *string, char *phonemes)
+{
+    if (NULL == esp_mn_root || esp_mn_model_handle == NULL || esp_mn_model_data == NULL) {
+        ESP_LOGE(TAG, "Please create mn model first.\n");
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    // strlen() and "%s" on a NULL pointer are undefined behaviour, so reject NULL up front.
+    if (string == NULL || phonemes == NULL) {
+        ESP_LOGE(TAG, "command string and phonemes must not be NULL");
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    int last_node_elem_num = esp_mn_commands_num();
+    ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands exceed ESP_MN_MAX_PHRASE_NUM");
+
+#ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, phonemes) == 0) {
+        // error message is printed inside check_speech_command
+        ESP_LOGE(TAG, "invalid command, please check format, %s (%s).\n", string, phonemes);
+        return ESP_ERR_INVALID_STATE;
+    }
+#else
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, string) == 0) {
+        // error message is printed inside check_speech_command
+        ESP_LOGE(TAG, "invalid command, please check format, %s.\n", string);
+        return ESP_ERR_INVALID_STATE;
+    }
+#endif
+
+    esp_mn_node_t *existing = esp_mn_command_search(string);
+    if (existing != NULL) {
+        // command already exists
+        if (command_id != existing->phrase->command_id) {
+            // change command id
+            existing->phrase->command_id = command_id;
+        } else {
+            // it's exactly the same, do nothing
+            ESP_LOGI(TAG, "command %d: (%s) already exists.", command_id, string);
+        }
+        return ESP_OK;
+    }
+
+    esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, string);
+    if (phrase == NULL) {
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    // Keep a private, NUL-terminated copy of the phonemes inside the phrase.
+    size_t phoneme_len = strlen(phonemes);
+    phrase->phonemes = _esp_mn_calloc_(phoneme_len + 1, sizeof(char));
+    if (phrase->phonemes == NULL) {
+        ESP_LOGE(TAG, "Fail to alloc phonemes of command %d", command_id);
+        // NOTE(review): relies on this module's esp_mn_phrase_free() to release the
+        // partially built phrase - confirm it is available in this translation unit.
+        esp_mn_phrase_free(phrase);
+        return ESP_ERR_INVALID_STATE;
+    }
+    memcpy(phrase->phonemes, phonemes, phoneme_len);
+    phrase->phonemes[phoneme_len] = '\0';
+
+    esp_mn_node_t *new_node = esp_mn_node_alloc(phrase);
+    if (new_node == NULL) {
+        // Do not link a NULL node and report success; release the phrase instead.
+        // NOTE(review): confirm esp_mn_phrase_free() also releases phrase->phonemes.
+        esp_mn_phrase_free(phrase);
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    // Append the new node at the tail of the command list.
+    esp_mn_node_t *tail = esp_mn_root;
+    while (tail->next != NULL) {
+        tail = tail->next;
+    }
+    tail->next = new_node;
+
+    return ESP_OK;
+}
+
 esp_err_t esp_mn_commands_modify(char *old_string, char *new_string)
 {
 #ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
--- a/src/include/esp_mn_speech_commands.h
+++ b/src/include/esp_mn_speech_commands.h
@ -59,6 +59,20 @@ esp_err_t esp_mn_commands_free(void);
 */
 esp_err_t esp_mn_commands_add(int command_id, char *string);

+/**
+ * @brief Add one speech command with its command string, command phonemes and command ID.
+ *        Please use multinet_g2p.py[esp-sr/tool/multinet_g2p.py] to generate phonemes from command string.
+ *        If a command with the same string already exists, only its command ID is updated.
+ * 
+ * @param command_id    The command ID
+ * @param string        The command string of the speech command
+ * @param phonemes      The phonemes of the speech command (copied internally)
+ *
+ * @return
+ *     - ESP_OK                  Success
+ *     - ESP_ERR_INVALID_STATE   Fail
+ */
+esp_err_t esp_mn_commands_phoneme_add(int command_id, char *string, char *phonemes);
+
 /**
 * @brief Modify one speech commands with new command string
 *
@ -178,4 +192,4 @@ void esp_mn_active_commands_print(void);

 #ifdef __cplusplus
 }
-#endif
+#endif
--- a/src/model_path.c
+++ b/src/model_path.c
@ -4,7 +4,9 @@
 #include "string.h"
 #include "model_path.h"
 #include "esp_wn_models.h"
+#ifndef CONFIG_IDF_TARGET_ESP32P4
 #include "esp_mn_models.h"
+#endif

 #ifdef ESP_PLATFORM
 #include <sys/dirent.h>
@ -244,6 +246,7 @@ void srmodel_spiffs_deinit(srmodel_list_t *models)

 }

+#ifdef CONFIG_IDF_TARGET_ESP32
 srmodel_list_t *srmodel_config_init()
 {
    if (static_srmodels == NULL) {
@ -305,6 +308,7 @@ void srmodel_config_deinit(srmodel_list_t *models)
    // models is static_srmodels
    static_srmodels = NULL;
 }
+#endif

 model_coeff_getter_t *srmodel_get_model_coeff(char *model_name)
 {
				`@ -0,0 +1 @@`
				`wakenet9l_tts1h8_Hi,Telly or Hi,泰力_3_0.613_0.619`