Merge branch 'feat/add_himfive' into 'master'

Feat/add himfive

See merge request speech-recognition-framework/esp-sr!57
This commit is contained in:
Sun Xiang Yu 2023-09-13 15:59:58 +08:00
commit f6e7ffcf9b
17 changed files with 51 additions and 24 deletions

View File

@ -4,6 +4,13 @@
- Available storage is less than the remaining flash space on IDF v5.0.
If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.
## Unreleased
- Add esp32c6 tts lib
- Return the volume of wake word audio when one wake word is detected
- Reduce MultiNet6 SRAM size from 48 KB to 32 KB
- Add "Hi M Five" wake word model from M5Stack
- Remove all MultiNet4 models
## 1.4.2
- Reset timeout trigger of multinet6 when a new speech command is detected
- Allocate all beams from PSRAM

View File

@ -80,6 +80,10 @@ choice SR_WN_MODEL_LOAD
config SR_WN_WN9_HIESP
bool "Hi,ESP (wn9_hiesp)"
depends on IDF_TARGET_ESP32S3
config SR_WN_WN9_HIMFIVE
bool "Hi,M Five (wn9_himfive)"
depends on IDF_TARGET_ESP32S3
config SR_WN_WN9_NIHAOXIAOZHI
bool "nihaoxiaozhi (wn9_nihaoxiaozhi)"
@ -122,7 +126,7 @@ config USE_MULTINET
choice CHINESE_SR_MN_MODEL_SEL
prompt "Chinese Speech Commands Model"
default SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
default SR_MN_CN_MULTINET6_QUANT
depends on USE_MULTINET
help
Select the Chinese Speech Commands Model to be used.
@ -134,14 +138,6 @@ choice CHINESE_SR_MN_MODEL_SEL
bool "chinese single recognition (mn2_cn)"
depends on IDF_TARGET_ESP32
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
bool "chinese recognition (mn4_cn)"
depends on IDF_TARGET_ESP32S3
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION_QUANT8
bool "chinese recognition (mn4q8_cn)"
depends on IDF_TARGET_ESP32S3
config SR_MN_CN_MULTINET5_RECOGNITION_QUANT8
bool "chinese recognition (mn5q8_cn)"
depends on IDF_TARGET_ESP32S3

View File

@ -13,7 +13,7 @@ ESP-SR framework includes the following modules:
* [Audio Front-end AFE](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/README.html)
* [Wake Word Engine WakeNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html)
* [Speech Command Word Recognition MultiNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_command_recognition/README.html)
* Speech Synthesis (only supports Chinese language)
* [Speech Synthesis](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html)
These algorithms are provided in the form of a component, so they can be integrated into your projects with minimum effort.
@ -23,15 +23,15 @@ The new algorithms will no longer support ESP32 chips.
## Wake Word Engine
Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices always listen to wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”. You can refer to **Model loading method** to build your project.
Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices to always listen for wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”.
Currently, Espressif has not only provided an official wake word "Hi,Lexin","Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see **Espressif Speech Wake Words Customization Process**.
Currently, Espressif has not only provided the official wake words "Hi,Lexin" and "Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see [Espressif Speech Wake Words Customization Process](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/ESP_Wake_Words_Customization.html).
## Speech Command Recognition
Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again. You can refer to **Model loading method** to build your project.
Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train the model again.
Currently, Espressif **MultiNet** supports up to 200 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light).
Currently, Espressif **MultiNet** supports up to 300 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light).
## Audio Front End

View File

@ -8,10 +8,4 @@ files:
exclude:
- ".github"
- "docs/**/*"
- "test_apps/**/*"
targets:
- esp32
- esp32s2
- esp32s3
- esp32c3
- esp32c6
- "test_apps/**/*"

View File

@ -22,8 +22,8 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -5, // The peak amplitude of audio fed to multinet is -5dB
AFE_MN_PEAK_AGC_MODE_2 = -4, // The peak amplitude of audio fed to multinet is -4dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
@ -72,7 +72,9 @@ typedef struct {
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
afe_mn_peak_agc_mode_t agc_mode; // The agc mode for ASR
float afe_linear_gain; // The linear gain for sr output (note: invalid for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR; the gain generated by AGC acts on the audio after the afe linear gain is applied.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
@ -97,6 +99,7 @@ typedef struct {
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 2, \
.pcm_config.mic_num = 1, \
@ -123,6 +126,7 @@ typedef struct {
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \

View File

@ -25,6 +25,8 @@ typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel (dB). This value is calculated before AGC. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -47,6 +47,28 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path):
models.append('wn9_alexa')
if "CONFIG_SR_WN_WN9_HIESP" in models_string:
models.append('wn9_hiesp')
if "CONFIG_SR_WN_WN9_HIMFIVE" in models_string:
models.append('wn9_himfive')
if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string:
models.append('wn9_nihaoxiaozhi')
if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string:
models.append('wn9_customword')
for item in models:
shutil.copytree(model_path + '/wakenet_model/' + item, target_path+'/'+item)
def copy_multinet_from_sdkconfig(model_path, sdkconfig_path, target_path):
"""
Copy multinet model from model_path to target_path based on sdkconfig
"""
with io.open(sdkconfig_path, "r") as f:
models_string = ''
for label in f:
label = label.strip("\n")
if 'CONFIG_SR_MN' in label and label[0] != '#':
models_string += label
models = []
if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string:
models.append('wn9_nihaoxiaozhi')
if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string:

View File

@ -0,0 +1,2 @@
# (neural network type)_(model data version)_(label1_detection window length_threshold for 90%_threshold for 95%)_(label2 ...)_...
wakenet9l_v2h8_himfive_3_0.640_0.645

Binary file not shown.

Binary file not shown.