diff --git a/CHANGELOG.md b/CHANGELOG.md index 57cdeab..299a952 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Change log for esp-sr +## 2.1.1 +- Add 8KHz AEC for VoIP +- Add more wakenet9 models + ## 2.1.0 - esp32c3 support wakenet9s and aec - esp32c5 support wakenet9s and aec diff --git a/Kconfig.projbuild b/Kconfig.projbuild index ee957f3..77a29e6 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -207,6 +207,10 @@ menu "Load Multiple Wake Words (WakeNet9)" bool "小特小特 (wn9_xiaotexiaote_tts2)" default False + config SR_WN_WN9_NIHAOXIAOYI_TTS2 + bool "你好小益 (wn9_nihaoxiaoyi_tts2)" + default False + config SR_WN_WN9_HIWALLE_TTS2 bool "Hi Wall E or Hi 瓦力(wn9_hiwalle_tts2)" default False diff --git a/README.md b/README.md index 772f5aa..b9dd3a3 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ [![Documentation Status](./docs/_static/sr_doc_latest.svg)](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/index.html) [![Component Registry](https://components.espressif.com/components/espressif/esp-sr/badge.svg)](https://components.espressif.com/components/espressif/esp-sr) -Espressif [ESP-SR](https://github.com/espressif/esp-sr) helps users build AI speech solutions based on ESP32-S3 or ESP32-P4 chips. +Espressif [ESP-SR](https://github.com/espressif/esp-sr) helps users build AI speech solutions. Overview -------- @@ -18,18 +18,19 @@ ESP-SR framework includes the following modules: These algorithms are provided in the form of a component, so they can be integrated into your projects with minimum effort. -ESP32-S3/ESP32-P4 are recommended, which support AI instructions and larger, high-speed octal SPI PSRAM. -The new algorithms will no longer support ESP32 chips. News ---- -[21/4/2025]: We add a new model WakeNet9s, which can run on chips that do not have PSRAM and do not support SIMD, such as ESP32C3 and ESP32C5. +[21/4/2025]: We add a new model WakeNet9s, which can run on chips that do not have PSRAM and do not support SIMD, such as ESP32C3 and ESP32C5. [examples](https://github.com/espressif/esp-skainet/tree/master/examples/wake_word_detection) [17/4/2025]: We add a new DOA(Direction of Arrival) algorithm. [14/2/2025]: We release **ESP-SR V2.0**. [Migration from ESP-SR V1.* to ESP-SR V2.*](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/migration_guide.html) [13/2/2025]: We release **VADNet**, a voice activaty detection model. You can use it to replace the WebRTC VAD and improve the performance. ## Wake Word Engine +| Supported Targets | ESP32 | ESP32-S2 | ESP32-S3 | ESP32-P4 | ESP32-C3 | ESP32-C5 | ESP32-C6 | +| ----------------- | -------- | -------- | -------- | -------- | -------- | -------- | -------- | + Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices always listen to wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”. WakeNet9 and WakeNet9s models are supported. WakeNet9s is a cost-down version of WakeNet9, with fewer parameters and lower computational requirements. Espressif offers two ways to customize the wake word, please refer to the following document to choose the one that meets your needs: @@ -74,11 +75,15 @@ The following wake words are supported in esp-sr: |小康同学 | | wn9_xiaokangtongxue_tts2| |小箭小箭 | | wn9_xiaojianxiaojian_tts2| |小特小特 | | wn9_xiaotexiaote_tts2| +|你好小益 | | wn9_nihaoxiaoyi_tts2| *NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples. `_tts2` suffix means this WakeNet model is trained by TTS Pipeline V2. ## Speech Command Recognition +| Supported Targets | ESP32 | ESP32-S3 | ESP32-P4 | +| ----------------- | -------- | -------- | -------- | + Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again. Currently, Espressif **MultiNet** supports up to 300 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light). @@ -92,6 +97,9 @@ The following MultiNet models are supported in esp-sr: ## Audio Front End +| Supported Targets | ESP32 | ESP32-S3 | ESP32-P4 | +| ----------------- | -------- | -------- | -------- | + Espressif Audio Front-End **AFE** integrates AEC (Acoustic Echo Cancellation), VAD (Voice Activity Detection), BSS (Blind Source Separation) and NS (Noise Suppression), NSNET(Deep noise suppression) and other functions. It is designed to be used with the ESP-SR library. Our two-mic Audio Front-End (AFE) have been qualified as a “Software Audio Front-End Solution” for [Amazon Alexa Built-in devices](https://developer.amazon.com/en-US/alexa/solution-providers/alexa-connect-kit). diff --git a/idf_component.yml b/idf_component.yml index ad6b2ae..bc6e0d2 100644 --- a/idf_component.yml +++ b/idf_component.yml @@ -1,4 +1,4 @@ -version: "2.1.0" +version: "2.1.1" description: esp_sr provides basic algorithms for Speech Recognition applications url: https://github.com/espressif/esp-sr dependencies: diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 5bb8311..9fcb743 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -33,7 +33,8 @@ typedef enum { // Set AFE type typedef enum { AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression + AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz } afe_type_t; typedef enum { diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index 5bb8311..9fcb743 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -33,7 +33,8 @@ typedef enum { // Set AFE type typedef enum { AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression + AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz } afe_type_t; typedef enum { diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 5bb8311..9fcb743 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -33,7 +33,8 @@ typedef enum { // Set AFE type typedef enum { AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression + AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz } afe_type_t; typedef enum { diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index c46a3c5..73ef41f 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libdl_lib.a b/lib/esp32/libdl_lib.a index 460f110..5746337 100644 Binary files a/lib/esp32/libdl_lib.a and b/lib/esp32/libdl_lib.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index 5e2188f..e4d546d 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index 0e99223..05226dc 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libflite_g2p.a b/lib/esp32/libflite_g2p.a index f29e5ed..83ab839 100644 Binary files a/lib/esp32/libflite_g2p.a and b/lib/esp32/libflite_g2p.a differ diff --git a/lib/esp32/libfst.a b/lib/esp32/libfst.a index b255771..7ed2b85 100644 Binary files a/lib/esp32/libfst.a and b/lib/esp32/libfst.a differ diff --git a/lib/esp32/libhufzip.a b/lib/esp32/libhufzip.a index ed3e5ff..7ea8db5 100644 Binary files a/lib/esp32/libhufzip.a and b/lib/esp32/libhufzip.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index f234f6c..44b9133 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libnsnet.a b/lib/esp32/libnsnet.a index 3798329..4935af6 100644 Binary files a/lib/esp32/libnsnet.a and b/lib/esp32/libnsnet.a differ diff --git a/lib/esp32/libvadnet.a b/lib/esp32/libvadnet.a index c7c2dd4..9d5446f 100644 Binary files a/lib/esp32/libvadnet.a and b/lib/esp32/libvadnet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index b3eae60..cd16bfb 100644 Binary files a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index da7dc0e..dfedc11 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index 0c7835b..e31b0a0 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 30618de..0cbaf88 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index 5fb6605..aa90d93 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index b9393ac..41573aa 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 0f6a42c..c8bc04c 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index b79c29c..4d8ddb2 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index e6c16e6..84f6f0c 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 6c40fe5..7a173d8 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index e10f653..3f26e16 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index e54b1d2..37f0b4c 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index 00257f0..dc04c45 100644 Binary files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index a75b1a8..39fbd2b 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 8315b89..75e6edc 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index 6ea55c4..547feaa 100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index b757e26..8ee93d5 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/wakenet_model/wn9_nihaoxiaoyi_tts2/_MODEL_INFO_ b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/_MODEL_INFO_ new file mode 100644 index 0000000..f4ebcb7 --- /dev/null +++ b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts2h12_你好小益_3_0.635_0.638 diff --git a/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_data b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_data new file mode 100644 index 0000000..067cdb5 Binary files /dev/null and b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_data differ diff --git a/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_index b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_index new file mode 100644 index 0000000..802f87b Binary files /dev/null and b/model/wakenet_model/wn9_nihaoxiaoyi_tts2/wn9_index differ