Merge branch 'master' into 'doc/mn7_en_doc'

# Conflicts: # docs/en/benchmark/README.rst
2025-09-15 15:28:44 +08:00 · 2023-11-14 02:47:38 +00:00 · 2023-11-14 02:47:38 +00:00 · e80a8cb690
commit e80a8cb690
parent 0a6982f65c 88ea0c93f3
17 changed files with 1982 additions and 25 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,6 +4,12 @@
 - Available storage is less than the remaining flash space on IDF v5.0.   
 If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.

+## 1.5.1
+- Reduce internal RAM usage of multinet7
+- Update benchmark
+- Add CI build test for esp32
+- Fix some bugs
+
 ## 1.5.0
 - Add esp32c6 tts lib
 - Return the volume of wake word audio when one wake word is detected
--- a/idf_component.yml
+++ b/idf_component.yml
@@ -1,4 +1,4 @@
-version: "1.5.0"
+version: "1.5.1"
 description: esp_sr provides basic algorithms for Speech Recognition applications
 url: https://github.com/espressif/esp-sr
 dependencies:
--- a/include/esp32/esp_mn_iface.h
+++ b/include/esp32/esp_mn_iface.h
@@ -21,7 +21,7 @@ typedef enum {
 } esp_mn_state_t;

 //Set multinet loading mode
-//The memory comsumption is decreased with increasing mode, 
+//The memory consumption is decreased with increasing mode,
 //As a consequence also the CPU loading rate goes up
 typedef enum {
    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
@@ -52,6 +52,7 @@ typedef struct{

 typedef struct {
    char *string;                               // command string
+    char *phonemes;                             // command phonemes, if applicable
    int16_t command_id;                         // the command id
    float threshold;                            // trigger threshold, default: 0
    int16_t *wave;                              // prompt wave data of the phrase
@@ -79,7 +80,7 @@ typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name,

 /**
 * @brief Switch multinet mode to change memory consumption and CPU loading
- * 
+ *
 * @warning Just Support multinet6 or later versions
 *
 * @param model The model object to query
@@ -109,7 +110,7 @@ typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
 typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);

 /**
- * @brief Set the detection threshold to manually abjust the probability 
+ * @brief Set the detection threshold to manually adjust the probability
 *
 * @param model The model object to query
 * @param det_treshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
@@ -127,7 +128,7 @@ typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
 /**
 * @brief Get the language of model
 *
- * @param model       The language name 
+ * @param model       The model object to query
 * @return Language name string defined in esp_mn_models.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
 */
 typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
@@ -136,7 +137,7 @@ typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
 * @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
 *
 * @param model       The model object to query.
- * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the 
+ * @param samples     An array of 16-bit signed audio samples. The array size used can be queried by the
 *                    get_samp_chunksize function.
 * @return The state of multinet
 */
@@ -150,10 +151,10 @@ typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, in
 typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);

 /**
- * @brief Get recognition results 
+ * @brief Get recognition results
 *
 * @param model       The Model object to query
- * 
+ *
 * @return The current results.
 */
 typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
@@ -186,14 +187,14 @@ typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_

 /**
 * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
- * 
+ *
 * @param model_data     The model object to query
 */
 typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);

 /**
 * @brief Check if input string can be tokenized
- * 
+ *
 * @param model_data     The model object to query
 * @param str            The input string
 */
@@ -206,7 +207,7 @@ typedef struct {
    esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
    esp_mn_iface_op_set_det_threshold_t set_det_threshold;
    esp_mn_iface_op_get_language_t get_language;
-    esp_mn_iface_op_detect_t detect; 
+    esp_mn_iface_op_detect_t detect;
    esp_mn_iface_op_destroy_t destroy;
    esp_mn_iface_op_get_results_t get_results;
    esp_mn_iface_op_open_log_t open_log;
--- a/include/esp32/flite_g2p.h
+++ b/include/esp32/flite_g2p.h
@@ -0,0 +1,20 @@
+#ifndef __FLITE_G2P_H__
+#define __FLITE_G2P_H__
+
+typedef struct {            // result container exchanged with the flite_g2p_result_* functions declared in this header
+    int num_phonemes;       // NOTE(review): presumably the number of entries in `phonemes` -- confirm against libflite_g2p
+    int phoneme_size;       // NOTE(review): meaning not visible here (capacity vs. per-entry size?) -- confirm and document
+    char **phonemes;        // array of char pointers; NOTE(review): presumably one NUL-terminated string per phoneme -- confirm
+} flite_g2p_result;
+
+void flite_g2p_result_free(flite_g2p_result *result);  // NOTE(review): presumably releases a result obtained from flite_g2p_get_result() -- confirm
+
+flite_g2p_result *flite_g2p_get_result(char *grapheme);  // takes a grapheme string, returns a flite_g2p_result pointer; NOTE(review): failure value and ownership are not documented here
+
+void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);  // NOTE(review): effect of map_phonemes is not visible in this header -- document it
+
+char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);  // returns char *; NOTE(review): who frees the returned string is not stated -- confirm
+
+char *flite_g2p(char *graphemes, int map_phonemes);  // graphemes in, char * out; NOTE(review): presumably a heap string the caller must free -- confirm
+
+#endif
--- a/lib/esp32/libesp_audio_front_end.a
+++ b/lib/esp32/libesp_audio_front_end.a
--- a/lib/esp32/libesp_audio_processor.a
+++ b/lib/esp32/libesp_audio_processor.a
--- a/lib/esp32/libflite_g2p.a
+++ b/lib/esp32/libflite_g2p.a
--- a/lib/esp32/libmultinet.a
+++ b/lib/esp32/libmultinet.a
--- a/lib/esp32/libwakenet.a
+++ b/lib/esp32/libwakenet.a
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/src/esp_mn_speech_commands.c
+++ b/src/esp_mn_speech_commands.c
@@ -12,6 +12,14 @@ static esp_mn_node_t *esp_mn_root = NULL;
 const static esp_mn_iface_t *esp_mn_model_handle = NULL;
 static model_iface_data_t *esp_mn_model_data = NULL;

+void *_esp_mn_calloc_(int n, int size)  // calloc-style helper: n elements of `size` bytes each; the result may be NULL on allocation failure
+{
+#ifdef ESP_PLATFORM
+    return heap_caps_calloc(n, size, MALLOC_CAP_SPIRAM);  // ESP builds: allocate with the MALLOC_CAP_SPIRAM capability
+#else
+    return calloc(n, size);  // non-ESP builds: plain C library calloc
+#endif
+}

 #define ESP_RETURN_ON_FALSE(a, err_code, log_tag, format, ...) do {                             \
        if (!(a)) {                                                                             \
@@ -130,7 +138,11 @@ esp_err_t esp_mn_commands_add(int command_id, char *string)
        return ESP_ERR_INVALID_STATE;
    }
 #ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
-    phrase->phonemes = phonemes;
+    int phoneme_len = strlen(phonemes);
+    phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char));
+    memcpy(phrase->phonemes, phonemes, phoneme_len);
+    phrase->phonemes[phoneme_len] = '\0';
+    free(phonemes);
 #endif
    esp_mn_node_t *new_node = esp_mn_node_alloc(phrase);
    while (temp->next != NULL) {
@@ -168,7 +180,11 @@ esp_err_t esp_mn_commands_modify(char *old_string, char *new_string)
            return ESP_ERR_INVALID_STATE;
        }
 #ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
-        phrase->phonemes = phonemes;
+        int phoneme_len = strlen(phonemes);
+        phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char));
+        memcpy(phrase->phonemes, phonemes, phoneme_len);
+        phrase->phonemes[phoneme_len] = '\0';
+        free(phonemes);
 #endif
        esp_mn_phrase_free(temp->phrase);
        temp->phrase = phrase;
@@ -297,15 +313,6 @@ void esp_mn_active_commands_print(void)
    ESP_LOGI(TAG, "---------------------------------------------------------\n");
 }

-void *_esp_mn_calloc_(int n, int size)
-{
-#ifdef ESP_PLATFORM
-    return heap_caps_calloc(n, size, MALLOC_CAP_SPIRAM);
-#else
-    return calloc(n, size);
-#endif
-}
-
 esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *string)
 {

--- a/test_apps/.build-rules.yml
+++ b/test_apps/.build-rules.yml
@@ -1,6 +1,6 @@
 test_apps/esp-sr:
  enable:
-    - if: IDF_TARGET in ["esp32s3"]
+    - if: IDF_TARGET in ["esp32s3", "esp32"]
      temporary: false

 test_apps/esp-tts:
--- a/test_apps/esp-sr/main/CMakeLists.txt
+++ b/test_apps/esp-sr/main/CMakeLists.txt
@@ -1,4 +1,3 @@
-if(IDF_TARGET STREQUAL "esp32s3")

 set(srcs
    "test_app_main.c"
@@ -13,4 +12,3 @@ idf_component_register(SRCS ${srcs}
                    WHOLE_ARCHIVE)

 target_compile_options(${COMPONENT_LIB} PRIVATE "-Wno-format")
-endif()
--- a/test_apps/esp-sr/partitions_esp32.csv
+++ b/test_apps/esp-sr/partitions_esp32.csv
@@ -0,0 +1,3 @@
+# Espressif ESP32 Partition Table
+# Name,  Type, SubType, Offset,  Size
+factory, app,  factory, 0x010000, 8000k
--- a/test_apps/esp-sr/sdkconfig.ci.mn2_cn
+++ b/test_apps/esp-sr/sdkconfig.ci.mn2_cn