diff --git a/CHANGELOG.md b/CHANGELOG.md index 97eebd6..2c3b5bf 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,13 @@ - Available storage is less than the remaining flash space on IDF v5.0. If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later. +## Unreleased +- Add esp32c6 tts lib +- Return the volume of wake word audio when one wake word is detected +- Reduce MultiNet6 SRAM size from 48KB to 32KB +- Add "Hi M Five" wake word model from M5Stack +- Remove all MultiNet4 models + ## 1.4.2 - Reset timeout trigger of multinet6 when a new speech command is detected - Allocate all beams from PSRAM diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 64eacfd..d1d59e5 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -80,6 +80,10 @@ choice SR_WN_MODEL_LOAD config SR_WN_WN9_HIESP bool "Hi,ESP (wn9_hiesp)" depends on IDF_TARGET_ESP32S3 + + config SR_WN_WN9_HIMFIVE + bool "Hi,M Five (wn9_himfive)" + depends on IDF_TARGET_ESP32S3 config SR_WN_WN9_NIHAOXIAOZHI bool "nihaoxiaozhi (wn9_nihaoxiaozhi)" @@ -122,7 +126,7 @@ config USE_MULTINET choice CHINESE_SR_MN_MODEL_SEL prompt "Chinese Speech Commands Model" - default SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION + default SR_MN_CN_MULTINET6_QUANT depends on USE_MULTINET help Select the Wake Word Engine to be used. 
@@ -134,14 +138,6 @@ choice CHINESE_SR_MN_MODEL_SEL bool "chinese single recognition (mn2_cn)" depends on IDF_TARGET_ESP32 - config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION - bool "chinese recognition (mn4_cn)" - depends on IDF_TARGET_ESP32S3 - - config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION_QUANT8 - bool "chinese recognition (mn4q8_cn)" - depends on IDF_TARGET_ESP32S3 - config SR_MN_CN_MULTINET5_RECOGNITION_QUANT8 bool "chinese recognition (mn5q8_cn)" depends on IDF_TARGET_ESP32S3 diff --git a/README.md b/README.md index 51f6c63..8169eca 100644 --- a/README.md +++ b/README.md @@ -13,7 +13,7 @@ ESP-SR framework includes the following modules: * [Audio Front-end AFE](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/README.html) * [Wake Word Engine WakeNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html) * [Speech Command Word Recognition MultiNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_command_recognition/README.html) -* Speech Synthesis (only supports Chinese language) +* [Speech Synthesis](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html) These algorithms are provided in the form of a component, so they can be integrated into your projects with minimum effort. @@ -23,15 +23,15 @@ The new algorithms will no longer support ESP32 chips. ## Wake Word Engine -Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices always listen to wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”. You can refer to **Model loading method** to build your project. +Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices always listen to wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”. 
-Currently, Espressif has not only provided an official wake word "Hi,Lexin","Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see **Espressif Speech Wake Words Customization Process**. +Currently, Espressif has not only provided an official wake word "Hi,Lexin","Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see [Espressif Speech Wake Words Customization Process](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/ESP_Wake_Words_Customization.html). ## Speech Command Recognition -Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again. You can refer to **Model loading method** to build your project. +Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again. -Currently, Espressif **MultiNet** supports up to 200 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light). +Currently, Espressif **MultiNet** supports up to 300 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light). 
## Audio Front End diff --git a/idf_component.yml b/idf_component.yml index 3e2ef2c..e47059e 100644 --- a/idf_component.yml +++ b/idf_component.yml @@ -8,10 +8,4 @@ files: exclude: - ".github" - "docs/**/*" - - "test_apps/**/*" -targets: - - esp32 - - esp32s2 - - esp32s3 - - esp32c3 - - esp32c6 + - "test_apps/**/*" \ No newline at end of file diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 9af4eb2..1291b5c 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -22,8 +22,8 @@ typedef enum { } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -5, // The peak amplitude of audio fed to multinet is -5dB - AFE_MN_PEAK_AGC_MODE_2 = -4, // The peak amplitude of audio fed to multinet is -4dB + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; @@ -72,7 +72,9 @@ typedef struct { int afe_perferred_priority; int afe_ringbuf_size; afe_memory_alloc_mode_t memory_alloc_mode; - afe_mn_peak_agc_mode_t agc_mode; // The agc mode for ASR + float afe_linear_gain; // The linear gain for sr output(note: invalid for vc), the value should be in [0.1, 10.0]. + // This value acts directly on the output amplitude: out_linear_gain * amplitude. + afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain. afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. 
bool debug_init; afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX]; @@ -97,6 +99,7 @@ typedef struct { .afe_perferred_priority = 5, \ .afe_ringbuf_size = 50, \ .memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \ + .afe_linear_gain = 1.0, \ .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ .pcm_config.total_ch_num = 2, \ .pcm_config.mic_num = 1, \ @@ -123,6 +126,7 @@ typedef struct { .afe_perferred_priority = 5, \ .afe_ringbuf_size = 50, \ .memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \ + .afe_linear_gain = 1.0, \ .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ .pcm_config.total_ch_num = 3, \ .pcm_config.mic_num = 2, \ diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index 276e493..d45c118 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -25,6 +25,8 @@ typedef struct afe_fetch_result_t { int16_t *data; // the data of audio. int data_size; // the size of data. The unit is byte. + float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). + // If wakenet is enabled, the window length is the receptive field of wakenet (about 1.5s); otherwise it is the frame length. wakenet_state_t wakeup_state; // the value is wakenet_state_t int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. 
diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index e078075..ea34fdf 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index 1451c7f..363df18 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index 1c49d5e..b31d5e3 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index 7799b99..7da3d76 100644 Binary files a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 200d7dd..faa177e 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index 8b3bef5..cd4880b 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 6be6a7c..b578cd1 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/movemodel.py b/model/movemodel.py index e1a5ca4..a618ff5 100644 --- a/model/movemodel.py +++ b/model/movemodel.py @@ -47,6 +47,28 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path): models.append('wn9_alexa') if "CONFIG_SR_WN_WN9_HIESP" in models_string: models.append('wn9_hiesp') + if "CONFIG_SR_WN_WN9_HIMFIVE" in models_string: + models.append('wn9_himfive') + if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string: + models.append('wn9_nihaoxiaozhi') + if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string: + models.append('wn9_customword') + + for item in models: + shutil.copytree(model_path + '/wakenet_model/' + item, 
target_path+'/'+item) + +def copy_multinet_from_sdkconfig(model_path, sdkconfig_path, target_path): + """ + Copy multinet model from model_path to target_path based on sdkconfig + """ + with io.open(sdkconfig_path, "r") as f: + models_string = '' + for label in f: + label = label.strip("\n") + if 'CONFIG_SR_MN' in label and label[0] != '#': + models_string += label + + models = [] if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string: models.append('wn9_nihaoxiaozhi') if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string: diff --git a/model/wakenet_model/wn9_himfive/_MODEL_INFO_ b/model/wakenet_model/wn9_himfive/_MODEL_INFO_ new file mode 100644 index 0000000..1e18dd6 --- /dev/null +++ b/model/wakenet_model/wn9_himfive/_MODEL_INFO_ @@ -0,0 +1,2 @@ +# (neural network type)_(model data version)_(lable1_detection windown length_threshold for 90%_threshold for 95%)_(lable2 ...)_... +wakenet9l_v2h8_himfive_3_0.640_0.645 diff --git a/model/wakenet_model/wn9_himfive/wn9_data b/model/wakenet_model/wn9_himfive/wn9_data new file mode 100644 index 0000000..990cdc6 Binary files /dev/null and b/model/wakenet_model/wn9_himfive/wn9_data differ diff --git a/model/wakenet_model/wn9_himfive/wn9_index b/model/wakenet_model/wn9_himfive/wn9_index new file mode 100644 index 0000000..a4dd205 Binary files /dev/null and b/model/wakenet_model/wn9_himfive/wn9_index differ