Merge branch 'feat/add_himfive' into 'master'

Feat/add himfive

See merge request speech-recognition-framework/esp-sr!57
This commit is contained in:
Sun Xiang Yu 2023-09-13 15:59:58 +08:00
commit f6e7ffcf9b
17 changed files with 51 additions and 24 deletions

View File

@ -4,6 +4,13 @@
- Available storage is less than the remaining flash space on IDF v5.0.
If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.
## Unreleased
- Add esp32c6 tts lib
- Return the volume of wake word audio when one wake word is detected
- Reduce MultiNet6 SRAM size from 48 KB to 32 KB
- Add "Hi M Five" wake word model from M5Stack
- Remove all MultiNet4 models
## 1.4.2
- Reset timeout trigger of multinet6 when a new speech command is detected
- Allocate all beams from PSRAM

View File

@ -80,6 +80,10 @@ choice SR_WN_MODEL_LOAD
config SR_WN_WN9_HIESP
bool "Hi,ESP (wn9_hiesp)"
depends on IDF_TARGET_ESP32S3
config SR_WN_WN9_HIMFIVE
bool "Hi,M Five (wn9_himfive)"
depends on IDF_TARGET_ESP32S3
config SR_WN_WN9_NIHAOXIAOZHI
bool "nihaoxiaozhi (wn9_nihaoxiaozhi)"
@ -122,7 +126,7 @@ config USE_MULTINET
choice CHINESE_SR_MN_MODEL_SEL
prompt "Chinese Speech Commands Model"
default SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
default SR_MN_CN_MULTINET6_QUANT
depends on USE_MULTINET
help
Select the Chinese Speech Commands Model to be used.
@ -134,14 +138,6 @@ choice CHINESE_SR_MN_MODEL_SEL
bool "chinese single recognition (mn2_cn)"
depends on IDF_TARGET_ESP32
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
bool "chinese recognition (mn4_cn)"
depends on IDF_TARGET_ESP32S3
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION_QUANT8
bool "chinese recognition (mn4q8_cn)"
depends on IDF_TARGET_ESP32S3
config SR_MN_CN_MULTINET5_RECOGNITION_QUANT8
bool "chinese recognition (mn5q8_cn)"
depends on IDF_TARGET_ESP32S3

View File

@ -13,7 +13,7 @@ ESP-SR framework includes the following modules:
* [Audio Front-end AFE](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/README.html)
* [Wake Word Engine WakeNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html)
* [Speech Command Word Recognition MultiNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_command_recognition/README.html)
* Speech Synthesis (only supports Chinese language)
* [Speech Synthesis](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html)
These algorithms are provided in the form of a component, so they can be integrated into your projects with minimum effort.
@ -23,15 +23,15 @@ The new algorithms will no longer support ESP32 chips.
## Wake Word Engine
Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices always listen to wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”. You can refer to **Model loading method** to build your project.
Espressif wake word engine **WakeNet** is specially designed to provide a high performance and low memory footprint wake word detection algorithm for users, which enables devices to always listen for wake words, such as “Alexa”, “Hi,lexin” and “Hi,ESP”.
Currently, Espressif has not only provided an official wake word "Hi,Lexin","Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see **Espressif Speech Wake Words Customization Process**.
Currently, Espressif has not only provided the official wake words "Hi,Lexin" and "Hi,ESP" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see [Espressif Speech Wake Words Customization Process](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/ESP_Wake_Words_Customization.html).
## Speech Command Recognition
Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again. You can refer to **Model loading method** to build your project.
Espressif's speech command recognition model **MultiNet** is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train the model again.
Currently, Espressif **MultiNet** supports up to 200 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light).
Currently, Espressif **MultiNet** supports up to 300 Chinese or English speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light).
## Audio Front End

View File

@ -8,10 +8,4 @@ files:
exclude:
- ".github"
- "docs/**/*"
- "test_apps/**/*"
targets:
- esp32
- esp32s2
- esp32s3
- esp32c3
- esp32c6
- "test_apps/**/*"

View File

@ -22,8 +22,8 @@ typedef enum {
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -5, // The peak amplitude of audio fed to multinet is -5dB
AFE_MN_PEAK_AGC_MODE_2 = -4, // The peak amplitude of audio fed to multinet is -4dB
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
@ -72,7 +72,9 @@ typedef struct {
int afe_perferred_priority;
int afe_ringbuf_size;
afe_memory_alloc_mode_t memory_alloc_mode;
afe_mn_peak_agc_mode_t agc_mode; // The agc mode for ASR
float afe_linear_gain; // The linear gain for sr output (note: invalid for vc), the value should be in [0.1, 10.0].
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR; the gain generated by AGC acts on the audio after the afe linear gain is applied.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
@ -97,6 +99,7 @@ typedef struct {
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 2, \
.pcm_config.mic_num = 1, \
@ -123,6 +126,7 @@ typedef struct {
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.afe_linear_gain = 1.0, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \

View File

@ -25,6 +25,8 @@ typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel (dB). This value is calculated before AGC. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -47,6 +47,28 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path):
models.append('wn9_alexa')
if "CONFIG_SR_WN_WN9_HIESP" in models_string:
models.append('wn9_hiesp')
if "CONFIG_SR_WN_WN9_HIMFIVE" in models_string:
models.append('wn9_himfive')
if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string:
models.append('wn9_nihaoxiaozhi')
if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string:
models.append('wn9_customword')
for item in models:
shutil.copytree(model_path + '/wakenet_model/' + item, target_path+'/'+item)
def copy_multinet_from_sdkconfig(model_path, sdkconfig_path, target_path):
"""
Copy multinet model from model_path to target_path based on sdkconfig
"""
with io.open(sdkconfig_path, "r") as f:
models_string = ''
for label in f:
label = label.strip("\n")
if 'CONFIG_SR_MN' in label and label[0] != '#':
models_string += label
models = []
if "CONFIG_SR_WN_WN9_NIHAOXIAOZHI" in models_string:
models.append('wn9_nihaoxiaozhi')
if "CONFIG_SR_WN_WN9_CUSTOMWORD" in models_string:

View File

@ -0,0 +1,2 @@
# (neural network type)_(model data version)_(label1_detection window length_threshold for 90%_threshold for 95%)_(label2 ...)_...
wakenet9l_v2h8_himfive_3_0.640_0.645

Binary file not shown.

Binary file not shown.