diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 146b3e9..2fcc7fb 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -188,6 +188,10 @@ menu "Load Multiple Wake Words" config SR_WN_WN9_XIAOSUROU_TTS2 bool "小酥肉 (wn9_xiaosurou_tts2)" default False + + config SR_WN_WN9_XIAOYUTONGXUE_TTS2 + bool "小宇同学 (wn9_xiaoyutongxue_tts2)" + default False endmenu diff --git a/README.md b/README.md index 898cdcd..3e291c8 100644 --- a/README.md +++ b/README.md @@ -12,6 +12,7 @@ ESP-SR framework includes the following modules: * [Audio Front-end AFE](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/README.html) * [Wake Word Engine WakeNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html) +* [VAD VADNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/vadnet/readme.html) * [Speech Command Word Recognition MultiNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_command_recognition/README.html) * [Speech Synthesis](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html) @@ -61,6 +62,7 @@ The following wake words are supported in esp-sr: |小鸭小鸭 | | wn9_xiaoyaxiaoya_tts2 | |璃奈板 | | wn9_linaiban_tts2 | |小酥肉 | | wn9_xiaosurou_tts2 | +|小宇同学 | | wn9_xiaoyutongxue_tts2 | *NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples. `_tts2` suffix means this WakeNet model is trained by TTS Pipeline V2. @@ -83,6 +85,7 @@ Espressif Audio Front-End **AFE** integrates AEC (Acoustic Echo Cancellation), V Our two-mic Audio Front-End (AFE) have been qualified as a “Software Audio Front-End Solution” for [Amazon Alexa Built-in devices](https://developer.amazon.com/en-US/alexa/solution-providers/dev-kits#software-audio-front-end-dev-kits). +AFE V2.0 has now been released; it is more efficient than AFE V1.0 and supports more models. 
**In order to achieve optimal performance:** diff --git a/docs/en/index.rst b/docs/en/index.rst index 5c83d67..b8135df 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -16,6 +16,7 @@ ESP-SR User Guide Getting Started Audio Front-end (AFE) Wake Word WakeNet + VAD Model vadnet Speech Command Word MultiNet Speech Synthesis (Only Supports Chinese Language) Flashing Models diff --git a/docs/en/vadnet/readme.rst b/docs/en/vadnet/readme.rst new file mode 100644 index 0000000..e01e3d0 --- /dev/null +++ b/docs/en/vadnet/readme.rst @@ -0,0 +1,69 @@ +Voice Activity Detection Model +============================== + +:link_to_translation:`zh_CN:[中文]` + +VADNet is a Voice Activity Detection (VAD) model built upon a neural network for low-power embedded MCUs. + +Overview +-------- + +VADNet uses a model structure and data processing flow similar to WakeNet. For more details, refer to :doc:`WakeNet <../wake_word_engine/README>`. + +VADNet is trained on about 5,000 hours of Chinese data, 5,000 hours of English data, and 5,000 hours of multilingual data. + + +Use VADNet +----------- + +- Select VADNet model + + To select VADNet model, please refer to Section :doc:`Flashing Models <../flash_model/README>` . + +- Run VADNet + + VADNet is currently included in the :doc:`AFE <../audio_front_end/README>`, which is enabled by default, and returns the detection results through the AFE fetch interface. + + The common VAD settings are as follows: + + :: + + afe_config->vad_init = true; // Whether to initialize VAD in the AFE pipeline. Default is true. + afe_config->vad_min_noise_ms = 1000; // The minimum duration of noise or silence in ms. + afe_config->vad_min_speech_ms = 128; // The minimum duration of speech in ms. + afe_config->vad_delay_ms = 128; // The delay between the first frame trigger of VAD and the first frame of speech data. + afe_config->vad_mode = VAD_MODE_1; // The larger the mode, the higher the speech trigger probability. 
+ + If users want to enable/disable/reset VADNet temporarily, please use: + + :: + + afe_handle->disable_vad(afe_data); // disable VADNet + afe_handle->enable_vad(afe_data); // enable VADNet + afe_handle->reset_vad(afe_data); // reset VADNet status + +- VAD Cache and Detection + + There are two issues in the VAD settings that can cause a delay in the first frame trigger of speech. + + 1. The inherent delay of the VAD algorithm itself. VAD cannot accurately trigger speech on the first frame and may delay by 1 to 3 frames. + 2. To avoid false triggers, the VAD is triggered when the continuous trigger duration reaches the `vad_min_speech_ms` parameter in the AFE configuration. + + Due to the above two reasons, directly using the first frame trigger of VAD may cause the first word to be truncated. + To avoid this case, AFE V2.0 has added a VAD cache. You can determine whether a VAD cache needs to be saved by checking `vad_cache_size`: + + :: + + afe_fetch_result_t* result = afe_handle->fetch(afe_data); + if (result->vad_cache_size > 0) { + printf("vad cache size: %d\n", result->vad_cache_size); + fwrite(result->vad_cache, 1, result->vad_cache_size, fp); + } + + printf("vad state: %s\n", result->vad_state==VAD_SILENCE ? "noise" : "speech"); + + +Resource Occupancy +------------------ + +For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`. \ No newline at end of file diff --git a/docs/en/wake_word_engine/ESP_Wake_Words_Customization.rst b/docs/en/wake_word_engine/ESP_Wake_Words_Customization.rst index fb91dbf..fd59d2b 100644 --- a/docs/en/wake_word_engine/ESP_Wake_Words_Customization.rst +++ b/docs/en/wake_word_engine/ESP_Wake_Words_Customization.rst @@ -10,7 +10,6 @@ Espressif provides users with the **wake word customization** : #. Espressif has already opened some wake words for customers' commercial use, such as "HI Leixi", or "Nihao Xiaoxin". 
- - For a complete list, see Table :ref:`Publicly Available Wake Words Provided by Espressif ` . - Espressif also plans to provide more wake words that are free for commercial use soon. #. Offline wake word customization can also be provided by Espressif: diff --git a/docs/en/wake_word_engine/README.rst b/docs/en/wake_word_engine/README.rst index 9979f2b..cf1fe34 100644 --- a/docs/en/wake_word_engine/README.rst +++ b/docs/en/wake_word_engine/README.rst @@ -46,32 +46,6 @@ Please see the flow diagram of WakeNet below: - Keyword Triggering Method: For continuous audio stream, we calculate the average recognition results (M) for several frames and generate a smoothing prediction result, to improve the accuracy of keyword triggering. Only when the M value is larger than the set threshold, a triggering command is sent. -The wake words supported by Espressif chips are listed below: - -.. _esp-open-wake-word: - -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| Chip | ESP32 | ESP32S3 | -+=================+===========+=============+=============+===========+===========+===========+===========+ -| model | WakeNet 5 | WakeNet 8 | WakeNet 9 | -| +-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| | WakeNet 5 | WakeNet 5X2 | WakeNet 5X3 | Q16 | Q8 | Q16 | Q8 | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| Hi,Lexin | √ | √ | √ | | | | √ | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| nihaoxiaozhi | √ | | √ | | | | √ | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| nihaoxiaoxin | | | √ | | | | | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| xiaoaitongxue | | | | | | | √ | 
-+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| Alexa | | | | √ | | | √ | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| Hi,ESP | | | | | | | √ | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ -| Customized word | | | | | | | √ | -+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+ - Use WakeNet ----------- @@ -89,7 +63,7 @@ Use WakeNet :: - afe_config.wakeNet_init = False. + afe_config->wakenet_init = false; If users want to enable/disable WakeNet temporarily, please use: diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst index 6fb2d42..9edf5f0 100644 --- a/docs/zh_CN/index.rst +++ b/docs/zh_CN/index.rst @@ -17,6 +17,7 @@ ESP-SR 用户指南 入门指南 AFE 声学前端算法 语音唤醒 WakeNet + VAD vadnet 语音指令 MultiNet 语音合成(仅支持中文) 模型加载 diff --git a/docs/zh_CN/vadnet/readme.rst b/docs/zh_CN/vadnet/readme.rst new file mode 100644 index 0000000..6285515 --- /dev/null +++ b/docs/zh_CN/vadnet/readme.rst @@ -0,0 +1,67 @@ +语音活动检测模型 +============================== + +:link_to_translation:`en:[English]` + +VADNet 是一个基于神经网络的语音活动检测模型,专为低功耗嵌入式MCU设计。 + +概述 +-------- + +VADNet 采用了与 WakeNet 相似的模型结构和数据处理流程,更多实现细节可参考 :doc:`音频前端处理模块 <../audio_front_end/README>` 中的说明。 + +VADNet 训练数据包括了大约5000小时中文数据, 5000 小时英文数据,还有5000小时的多语言数据。 + +使用VADNet +----------- + +- 选择VADNet模型 + + 选择VADNet模型请参考 :doc:`模型烧录指南 <../flash_model/README>` 。 + +- 运行VADNet + + VADNet 当前集成在 :doc:`音频前端处理模块 <../audio_front_end/README>` 中,默认处于启用状态,通过AFE的fetch接口返回检测结果。 + + 常用VAD参数配置如下: + + :: + + afe_config->vad_init = true; // 是否在AFE流水线中初始化VAD,默认启用 + afe_config->vad_min_noise_ms = 1000; // 噪声/静音段的最短持续时间(毫秒) + afe_config->vad_min_speech_ms = 128; // 语音段的最短持续时间(毫秒) + afe_config->vad_delay_ms = 128; // VAD首帧触发到语音首帧数据的延迟量 + afe_config->vad_mode = VAD_MODE_1; // 模式值越大,语音触发概率越高 + + 如需临时启用/禁用/重置VADNet,可使用以下接口: + 
+ :: + + afe_handle->disable_vad(afe_data); // 禁用VAD + afe_handle->enable_vad(afe_data); // 启用VAD + afe_handle->reset_vad(afe_data); // 重置VAD状态 + +- VAD缓存与检测 + + VAD配置中的两个特性可能导致语音首帧触发延迟: + + 1. VAD算法固有延迟:VAD无法在首帧精准触发,可能有1-3帧延迟 + 2. 防误触机制:需持续触发时间达到配置参数 `vad_min_speech_ms` 才会正式触发 + + 为避免上述原因导致语音首字截断,AFE V2.0新增了VAD缓存机制。可通过检查vad_cache_size判断是否需要保存VAD缓存: + + :: + + afe_fetch_result_t* result = afe_handle->fetch(afe_data); + if (result->vad_cache_size > 0) { + printf("vad缓存大小: %d\n", result->vad_cache_size); + fwrite(result->vad_cache, 1, result->vad_cache_size, fp); // 写入缓存数据 + } + + printf("vad状态: %s\n", result->vad_state==VAD_SILENCE ? "环境噪声" : "语音活动"); + + +资源占用 +------------------ + +本模型的资源占用情况请参考 :doc:`资源占用说明 <../benchmark/README>`。 \ No newline at end of file diff --git a/docs/zh_CN/wake_word_engine/ESP_Wake_Words_Customization.rst b/docs/zh_CN/wake_word_engine/ESP_Wake_Words_Customization.rst index 931e551..f07df27 100644 --- a/docs/zh_CN/wake_word_engine/ESP_Wake_Words_Customization.rst +++ b/docs/zh_CN/wake_word_engine/ESP_Wake_Words_Customization.rst @@ -10,7 +10,6 @@ #. “HI乐鑫”,“你好小鑫” 等官方开放的唤醒词,客户可直接商用 - - 完整列表可见 :ref:`乐鑫免费商用唤醒词 ` - 同时,乐鑫会逐渐开放更多可免费商用的唤醒词 #. 
 除官方开放的唤醒词,乐鑫还可为客户提供 **唤醒词定制服务**,主要分如下两种情况: diff --git a/model/wakenet_model/wn9_xiaoyutongxue_tts/_MODEL_INFO_ b/model/wakenet_model/wn9_xiaoyutongxue_tts/_MODEL_INFO_ new file mode 100644 index 0000000..37dae7b --- /dev/null +++ b/model/wakenet_model/wn9_xiaoyutongxue_tts/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts2h12_小宇同学_3_0.624_0.630 \ No newline at end of file diff --git a/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_data b/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_data new file mode 100644 index 0000000..8499ca9 Binary files /dev/null and b/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_data differ diff --git a/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_index b/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_index new file mode 100644 index 0000000..5e7c881 Binary files /dev/null and b/model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_index differ diff --git a/test_apps/esp-sr/main/CMakeLists.txt b/test_apps/esp-sr/main/CMakeLists.txt index 08ad7bb..1d8dfd9 100644 --- a/test_apps/esp-sr/main/CMakeLists.txt +++ b/test_apps/esp-sr/main/CMakeLists.txt @@ -5,6 +5,7 @@ set(srcs "test_multinet.cpp" "test_afe.cpp" "test_mfcc.cpp" + "test_vadnet.cpp" ) idf_component_register(SRCS ${srcs} diff --git a/test_apps/esp-sr/main/test_vadnet.cpp b/test_apps/esp-sr/main/test_vadnet.cpp new file mode 100644 index 0000000..691a54a --- /dev/null +++ b/test_apps/esp-sr/main/test_vadnet.cpp @@ -0,0 +1,115 @@ +/* test_vadnet.cpp: Unit tests for the VADNet model interface. + + This example code is in the Public Domain (or CC0 licensed, at your option.) + + Unless required by applicable law or agreed to in writing, this + software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR + CONDITIONS OF ANY KIND, either express or implied. 
+*/ + +#include "string.h" +#include +#include "unity.h" + +#include "model_path.h" +#include "esp_vadn_iface.h" +#include "esp_vadn_models.h" +#include "hilexin.h" +#include "hiesp.h" +#include "dl_lib_convq_queue.h" +#include + +TEST_CASE("vadnet create/destroy API & memory leak", "[wn]") +{ + vTaskDelay(500 / portTICK_PERIOD_MS); + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + srmodel_list_t *models = esp_srmodel_init("model"); + char *model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); + esp_vadn_iface_t *vadnet = (esp_vadn_iface_t*)esp_vadn_handle_from_name(model_name); + + // test model loading time + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + model_iface_data_t *model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64); + gettimeofday(&tv_end, NULL); + int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000; + printf("create latency:%d ms\n", tv_ms); + + // test model memory concumption + int create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT); + int create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + printf("Internal RAM: %d, PSRAM:%d\n", create_internal_size, create_size - create_internal_size); + vadnet->destroy(model_data); + esp_srmodel_deinit(models); + + // test memory leak + int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + int last_end_size = first_end_size; + int mem_leak = start_size - last_end_size; + printf("create&destroy times:%d, memory leak:%d\n", 1, mem_leak); + + for (int i = 0; i < 6; i++) { + printf("init partition ...\n"); + models = esp_srmodel_init("model"); + model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); + vadnet = (esp_vadn_iface_t*)esp_vadn_handle_from_name(model_name); + // char *wake_words = esp_srmodel_get_wake_words(models, model_name); + + printf("create ...\n"); + 
model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64); + + printf("destroy ...\n"); + vadnet->destroy(model_data); + // free(wake_words); + esp_srmodel_deinit(models); + + last_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + mem_leak = start_size - last_end_size; + printf("create&destroy times:%d, memory leak:%d\n", i + 2, mem_leak); + } + + TEST_ASSERT_EQUAL(true, (mem_leak) < 1000 && last_end_size == first_end_size); +} + +TEST_CASE("vadnet detect API & cpu loading", "[wn]") +{ + vTaskDelay(500 / portTICK_PERIOD_MS); + srmodel_list_t *models = esp_srmodel_init("model"); + char *model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL); + esp_vadn_iface_t *vadnet = (esp_vadn_iface_t*)esp_vadn_handle_from_name(model_name); + model_iface_data_t *model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64); + int frequency = vadnet->get_samp_rate(model_data); + int audio_chunksize = vadnet->get_samp_chunksize(model_data) * sizeof(int16_t); + int16_t *buffer = (int16_t *) malloc(audio_chunksize); + int chunks = 0; + int detected = 0; + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + unsigned char* data = (unsigned char*)hilexin; + size_t data_size = sizeof(hilexin); + + while (1) { + if ((chunks + 1)*audio_chunksize <= data_size) { + memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize); + } else { + break; + } + vad_state_t res = vadnet->detect(model_data, buffer); + if (res == VAD_SPEECH) { + detected += 1; + } + + chunks++; + } + gettimeofday(&tv_end, NULL); + int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000; + int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency; + float cpu_loading = tv_ms * 100.0 / run_ms; + printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. 
CPU loading(single core):%.1f%%\n", + tv_ms, run_ms, chunks, cpu_loading); + + vadnet->destroy(model_data); + esp_srmodel_deinit(models); + TEST_ASSERT_EQUAL(true, (cpu_loading < 50 && detected > 35)); +} diff --git a/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin b/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin index d9f287e..ca9fd48 100644 --- a/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin +++ b/test_apps/esp-sr/sdkconfig.ci.p4_wn9_hilexin @@ -5,6 +5,7 @@ CONFIG_IDF_TARGET="esp32p4" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_SPIRAM=y CONFIG_ESP_TASK_WDT_EN=n diff --git a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin index d4174c9..00fe3c3 100644 --- a/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin +++ b/test_apps/esp-sr/sdkconfig.ci.wn9_hilexin @@ -5,6 +5,7 @@ CONFIG_IDF_TARGET="esp32s3" CONFIG_ESPTOOLPY_FLASHMODE_QIO=y CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y CONFIG_PARTITION_TABLE_CUSTOM=y +CONFIG_SR_VADN_VADNET1_MEDIUM=y CONFIG_SR_WN_WN9_HILEXIN=y CONFIG_SPIRAM=y CONFIG_ESP_TASK_WDT_EN=n