mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Merge branch 'docs/vad' into 'master'
Docs/vad See merge request speech-recognition-framework/esp-sr!137
This commit is contained in:
commit
d7fc5934be
@ -188,6 +188,10 @@ menu "Load Multiple Wake Words"
|
||||
config SR_WN_WN9_XIAOSUROU_TTS2
|
||||
bool "小酥肉 (wn9_xiaosurou_tts2)"
|
||||
default n
|
||||
|
||||
config SR_WN_WN9_XIAOYUTONGXUE_TTS2
|
||||
bool "小宇同学 (wn9_xiaoyutongxue_tts2)"
|
||||
default n
|
||||
endmenu
|
||||
|
||||
|
||||
|
||||
@ -12,6 +12,7 @@ ESP-SR framework includes the following modules:
|
||||
|
||||
* [Audio Front-end AFE](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/README.html)
|
||||
* [Wake Word Engine WakeNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/wake_word_engine/README.html)
|
||||
* [VAD VADNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/vadnet/README.html)
|
||||
* [Speech Command Word Recognition MultiNet](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_command_recognition/README.html)
|
||||
* [Speech Synthesis](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html)
|
||||
|
||||
@ -61,6 +62,7 @@ The following wake words are supported in esp-sr:
|
||||
|小鸭小鸭 | | wn9_xiaoyaxiaoya_tts2 |
|
||||
|璃奈板 | | wn9_linaiban_tts2 |
|
||||
|小酥肉 | | wn9_xiaosurou_tts2 |
|
||||
|小宇同学 | | wn9_xiaoyutongxue_tts2 |
|
||||
|
||||
*NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples. `_tts2` suffix means this WakeNet model is trained by TTS Pipeline V2.
|
||||
|
||||
@ -83,6 +85,7 @@ Espressif Audio Front-End **AFE** integrates AEC (Acoustic Echo Cancellation), V
|
||||
|
||||
Our two-mic Audio Front-End (AFE) has been qualified as a “Software Audio Front-End Solution” for [Amazon Alexa Built-in devices](https://developer.amazon.com/en-US/alexa/solution-providers/dev-kits#software-audio-front-end-dev-kits).
|
||||
|
||||
Now AFE V2.0 has been released, which is more efficient than AFE V1.0 and supports more models.
|
||||
|
||||
**In order to achieve optimal performance:**
|
||||
|
||||
|
||||
@ -16,6 +16,7 @@ ESP-SR User Guide
|
||||
Getting Started <getting_started/readme>
|
||||
Audio Front-end (AFE) <audio_front_end/index>
|
||||
Wake Word WakeNet <wake_word_engine/index>
|
||||
VAD Model vadnet <vadnet/readme>
|
||||
Speech Command Word MultiNet <speech_command_recognition/README>
|
||||
Speech Synthesis (Only Supports Chinese Language) <speech_synthesis/readme>
|
||||
Flashing Models <flash_model/README>
|
||||
|
||||
69
docs/en/vadnet/readme.rst
Normal file
69
docs/en/vadnet/readme.rst
Normal file
@ -0,0 +1,69 @@
|
||||
Voice Activity Detection Model
|
||||
==============================
|
||||
|
||||
:link_to_translation:`zh_CN:[中文]`
|
||||
|
||||
VADNet is a Voice Activity Detection model built upon a neural network for low-power embedded MCUs.
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
VADNet uses a model structure and data processing flow similar to WakeNet, for more details, you can refer to :doc:`AFE <../wake_word_engine/README>`
|
||||
|
||||
VADNet is trained by about 5,000 hours of Chinese data, 5,000 hours of English data, and 5,000 hours of multilingual data.
|
||||
|
||||
|
||||
Use VADNet
|
||||
-----------
|
||||
|
||||
- Select VADNet model
|
||||
|
||||
To select VADNet model, please refer to Section :doc:`Flashing Models <../flash_model/README>` .
|
||||
|
||||
- Run VADNet
|
||||
|
||||
VADNet is currently included in the :doc:`AFE <../audio_front_end/README>`, which is enabled by default, and returns the detection results through the AFE fetch interface.
|
||||
|
||||
The common vad setting is as follows:
|
||||
|
||||
::
|
||||
|
||||
afe_config->vad_init = true // Whether to initial vad in AFE pipeline. Default is true.
|
||||
afe_config->vad_min_noise_ms = 1000; // The minimum duration of noise or silence in ms.
|
||||
afe_config->vad_min_speech_ms = 128; // The minimum duration of speech in ms.
|
||||
afe_config->vad_delay_ms = 128; // The delay between the first frame trigger of VAD and the first frame of speech data.
|
||||
afe_config->vad_mode = VAD_MODE_1; // The larger the mode, the higher the speech trigger probability.
|
||||
|
||||
If users want to enable/disable/reset VADNet temporarily, please use:
|
||||
|
||||
::
|
||||
|
||||
afe_handle->disable_vad(afe_data); // disable VADNet
|
||||
afe_handle->enable_vad(afe_data); // enable VADNet
|
||||
afe_handle->reset_vad(afe_data); // reset VADNet status
|
||||
|
||||
- VAD Cache and Detection
|
||||
|
||||
There are two issues in the VAD settings that can cause a delay in the first frame trigger of speech.
|
||||
|
||||
1. The inherent delay of the VAD algorithm itself. VAD cannot accurately trigger speech on the first frame and may delay by 1 to 3 frames.
|
||||
2. To avoid false triggers, the VAD is triggered when the continuous trigger duration reaches the `vad_min_speech_ms` parameter in the AFE configuration.
|
||||
|
||||
Due to the above two reasons, directly using the first frame trigger of VAD may cause the first word to be truncated.
|
||||
To avoid this case, AFE V2.0 has added a VAD cache. You can determine whether a VAD cache needs to be saved by checking the vad_cache_size
|
||||
|
||||
::
|
||||
|
||||
afe_fetch_result_t* result = afe_handle->fetch(afe_data);
|
||||
if (result->vad_cache_size > 0) {
|
||||
printf("vad cache size: %d\n", result->vad_cache_size);
|
||||
fwrite(result->vad_cache, 1, result->vad_cache_size, fp);
|
||||
}
|
||||
|
||||
printf("vad state: %s\n", result->vad_state==VAD_SILENCE ? "noise" : "speech");
|
||||
|
||||
|
||||
Resource Occupancy
|
||||
------------------
|
||||
|
||||
For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`.
|
||||
@ -10,7 +10,6 @@ Espressif provides users with the **wake word customization** :
|
||||
|
||||
#. Espressif has already opened some wake words for customers' commercial use, such as "HI Leixi", or "Nihao Xiaoxin".
|
||||
|
||||
- For a complete list, see Table :ref:`Publicly Available Wake Words Provided by Espressif <esp-open-wake-word>` .
|
||||
- Espressif also plans to provide more wake words that are free for commercial use soon.
|
||||
|
||||
#. Offline wake word customization can also be provided by Espressif:
|
||||
|
||||
@ -46,32 +46,6 @@ Please see the flow diagram of WakeNet below:
|
||||
- Keyword Triggering Method:
|
||||
For continuous audio stream, we calculate the average recognition results (M) for several frames and generate a smoothing prediction result, to improve the accuracy of keyword triggering. Only when the M value is larger than the set threshold, a triggering command is sent.
|
||||
|
||||
The wake words supported by Espressif chips are listed below:
|
||||
|
||||
.. _esp-open-wake-word:
|
||||
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| Chip | ESP32 | ESP32S3 |
|
||||
+=================+===========+=============+=============+===========+===========+===========+===========+
|
||||
| model | WakeNet 5 | WakeNet 8 | WakeNet 9 |
|
||||
| +-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| | WakeNet 5 | WakeNet 5X2 | WakeNet 5X3 | Q16 | Q8 | Q16 | Q8 |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| Hi,Lexin | √ | √ | √ | | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| nihaoxiaozhi | √ | | √ | | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| nihaoxiaoxin | | | √ | | | | |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| xiaoaitongxue | | | | | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| Alexa | | | | √ | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| Hi,ESP | | | | | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
| Customized word | | | | | | | √ |
|
||||
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
|
||||
|
||||
Use WakeNet
|
||||
-----------
|
||||
|
||||
@ -89,7 +63,7 @@ Use WakeNet
|
||||
|
||||
::
|
||||
|
||||
afe_config.wakeNet_init = False.
|
||||
afe_config->wakeNet_init = false;
|
||||
|
||||
If users want to enable/disable WakeNet temporarily, please use:
|
||||
|
||||
|
||||
@ -17,6 +17,7 @@ ESP-SR 用户指南
|
||||
入门指南 <getting_started/readme>
|
||||
AFE 声学前端算法 <audio_front_end/index>
|
||||
语音唤醒 WakeNet <wake_word_engine/index>
|
||||
VAD vadnet <vadnet/readme>
|
||||
语音指令 MultiNet <speech_command_recognition/README>
|
||||
语音合成(仅支持中文)<speech_synthesis/readme>
|
||||
模型加载 <flash_model/README>
|
||||
|
||||
67
docs/zh_CN/vadnet/readme.rst
Normal file
67
docs/zh_CN/vadnet/readme.rst
Normal file
@ -0,0 +1,67 @@
|
||||
语音活动检测模型
|
||||
==============================
|
||||
|
||||
:link_to_translation:`en:[English]`
|
||||
|
||||
VADNet 是一个基于神经网络的语音活动检测模型,专为低功耗嵌入式MCU设计。
|
||||
|
||||
概述
|
||||
--------
|
||||
|
||||
VADNet 采用了与 WakeNet 相似的模型结构和数据处理流程,更多实现细节可参考 :doc:`音频前端处理模块 <../audio_front_end/README>` 中的说明。
|
||||
|
||||
VADNet 训练数据包括了大约5000小时中文数据, 5000 小时英文数据,还有5000小时的多语言数据。
|
||||
|
||||
使用VADNet
|
||||
-----------
|
||||
|
||||
- 选择VADNet模型
|
||||
|
||||
选择VADNet模型请参考 :doc:`模型烧录指南 <../flash_model/README>` 。
|
||||
|
||||
- 运行VADNet
|
||||
|
||||
VADNet 当前集成在 :doc:`音频前端处理模块 <../audio_front_end/README>` 中,默认处于启用状态,通过AFE的fetch接口返回检测结果。
|
||||
|
||||
常用VAD参数配置如下:
|
||||
|
||||
::
|
||||
|
||||
afe_config->vad_init = true // 是否在AFE流水线中初始化VAD,默认启用
|
||||
afe_config->vad_min_noise_ms = 1000; // 噪声/静音段的最短持续时间(毫秒)
|
||||
afe_config->vad_min_speech_ms = 128; // 语音段的最短持续时间(毫秒)
|
||||
afe_config->vad_delay_ms = 128; // VAD首帧触发到语音首帧数据的延迟量
|
||||
afe_config->vad_mode = VAD_MODE_1; // 模式值越大,语音触发概率越高
|
||||
|
||||
如需临时启用/禁用/重置VADNet,可使用以下接口:
|
||||
|
||||
::
|
||||
|
||||
afe_handle->disable_vad(afe_data); // 禁用VAD
|
||||
afe_handle->enable_vad(afe_data); // 启用VAD
|
||||
afe_handle->reset_vad(afe_data); // 重置VAD状态
|
||||
|
||||
- VAD缓存与检测
|
||||
|
||||
VAD配置中的两个特性可能导致语音首帧触发延迟:
|
||||
|
||||
1. VAD算法固有延迟:VAD无法在首帧精准触发,可能有1-3帧延迟
|
||||
2. 防误触机制:需持续触发时间达到配置参数`vad_min_speech_ms`才会正式触发
|
||||
|
||||
为避免上述原因导致语音首字截断,AFE V2.0新增了VAD缓存机制。可通过检查vad_cache_size判断是否需要保存VAD缓存:
|
||||
|
||||
::
|
||||
|
||||
afe_fetch_result_t* result = afe_handle->fetch(afe_data);
|
||||
if (result->vad_cache_size > 0) {
|
||||
printf("vad缓存大小: %d\n", result->vad_cache_size);
|
||||
fwrite(result->vad_cache, 1, result->vad_cache_size, fp); // 写入缓存数据
|
||||
}
|
||||
|
||||
printf("vad状态: %s\n", result->vad_state==VAD_SILENCE ? "环境噪声" : "语音活动");
|
||||
|
||||
|
||||
资源占用
|
||||
------------------
|
||||
|
||||
本模型的资源占用情况请参考 :doc:`资源占用说明 <../benchmark/README>`。
|
||||
@ -10,7 +10,6 @@
|
||||
|
||||
#. “HI乐鑫”,“你好小鑫” 等官方开放的唤醒词,客户可直接商用
|
||||
|
||||
- 完整列表可见 :ref:`乐鑫免费商用唤醒词 <esp-open-wake-word>`
|
||||
- 同时,乐鑫会逐渐开放更多可免费商用的唤醒词
|
||||
|
||||
#. 除官方开放的唤醒词,乐鑫还可为客户提供 **唤醒词定制服务**,主要分如下两种情况:
|
||||
|
||||
1
model/wakenet_model/wn9_xiaoyutongxue_tts/_MODEL_INFO_
Normal file
1
model/wakenet_model/wn9_xiaoyutongxue_tts/_MODEL_INFO_
Normal file
@ -0,0 +1 @@
|
||||
wakenet9l_tts2h12_小宇同学_3_0.624_0.630
|
||||
BIN
model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_data
Normal file
BIN
model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_data
Normal file
Binary file not shown.
BIN
model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_index
Normal file
BIN
model/wakenet_model/wn9_xiaoyutongxue_tts/wn9_index
Normal file
Binary file not shown.
@ -5,6 +5,7 @@ set(srcs
|
||||
"test_multinet.cpp"
|
||||
"test_afe.cpp"
|
||||
"test_mfcc.cpp"
|
||||
"test_vadnet.cpp"
|
||||
)
|
||||
|
||||
idf_component_register(SRCS ${srcs}
|
||||
|
||||
115
test_apps/esp-sr/main/test_vadnet.cpp
Normal file
115
test_apps/esp-sr/main/test_vadnet.cpp
Normal file
@ -0,0 +1,115 @@
|
||||
/* test_vadnet.cpp: Unit tests for the VADNet voice activity detection model.
|
||||
|
||||
This example code is in the Public Domain (or CC0 licensed, at your option.)
|
||||
|
||||
Unless required by applicable law or agreed to in writing, this
|
||||
software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
|
||||
CONDITIONS OF ANY KIND, either express or implied.
|
||||
*/
|
||||
|
||||
#include "string.h"
|
||||
#include <limits.h>
|
||||
#include "unity.h"
|
||||
|
||||
#include "model_path.h"
|
||||
#include "esp_vadn_iface.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "hilexin.h"
|
||||
#include "hiesp.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
#include <sys/time.h>
|
||||
|
||||
/* Verifies the VADNet create/destroy API: measures model load latency and
 * RAM/PSRAM consumption, then repeatedly creates and destroys the model to
 * detect heap leaks. Fails if more than ~1 KB leaks or the heap does not
 * return to its post-first-cycle size.
 */
TEST_CASE("vadnet create/destroy API & memory leak", "[wn]")
{
    // Let startup tasks settle before sampling heap sizes.
    vTaskDelay(500 / portTICK_PERIOD_MS);
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
    esp_vadn_iface_t *vadnet = (esp_vadn_iface_t *)esp_vadn_handle_from_name(model_name);

    // Measure model loading time.
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    model_iface_data_t *model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64);
    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    printf("create latency:%d ms\n", tv_ms);

    // Measure model memory consumption, split into internal RAM and PSRAM.
    int create_size = start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int create_internal_size = start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
    printf("Internal RAM: %d, PSRAM:%d\n", create_internal_size, create_size - create_internal_size);
    vadnet->destroy(model_data);
    esp_srmodel_deinit(models);

    // Leak check after the first create/destroy cycle; later cycles must
    // return the heap to this exact size.
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int last_end_size = first_end_size;
    int mem_leak = start_size - last_end_size;
    printf("create&destroy times:%d, memory leak:%d\n", 1, mem_leak);

    for (int i = 0; i < 6; i++) {
        printf("init partition ...\n");
        models = esp_srmodel_init("model");
        model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
        vadnet = (esp_vadn_iface_t *)esp_vadn_handle_from_name(model_name);

        printf("create ...\n");
        model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64);

        printf("destroy ...\n");
        vadnet->destroy(model_data);
        esp_srmodel_deinit(models);

        last_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
        mem_leak = start_size - last_end_size;
        printf("create&destroy times:%d, memory leak:%d\n", i + 2, mem_leak);
    }

    // Tolerate a small one-time allocation (< 1000 bytes) from the first
    // cycle, but the heap must be stable across subsequent cycles.
    TEST_ASSERT_EQUAL(true, (mem_leak) < 1000 && last_end_size == first_end_size);
}
|
||||
|
||||
/* Feeds the bundled "hilexin" audio clip chunk-by-chunk through the VADNet
 * detect API, counting speech-detected frames and measuring single-core CPU
 * loading. Fails if CPU loading >= 50% or fewer than 36 speech frames are
 * detected.
 */
TEST_CASE("vadnet detect API & cpu loading", "[wn]")
{
    // Let startup tasks settle before timing.
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_VADN_PREFIX, NULL);
    esp_vadn_iface_t *vadnet = (esp_vadn_iface_t *)esp_vadn_handle_from_name(model_name);
    model_iface_data_t *model_data = vadnet->create(model_name, VAD_MODE_0, 1, 32, 64);
    int frequency = vadnet->get_samp_rate(model_data);
    // Chunk size in bytes: samples per chunk * bytes per 16-bit sample.
    int audio_chunksize = vadnet->get_samp_chunksize(model_data) * sizeof(int16_t);
    int16_t *buffer = (int16_t *) malloc(audio_chunksize);
    TEST_ASSERT_NOT_NULL(buffer);  // fix: guard against allocation failure
    int chunks = 0;
    int detected = 0;
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    unsigned char *data = (unsigned char *)hilexin;
    size_t data_size = sizeof(hilexin);

    // Stream the clip through detect(), one full chunk at a time; any
    // trailing partial chunk is dropped.
    while (1) {
        if ((chunks + 1) * audio_chunksize <= data_size) {
            memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
        } else {
            break;
        }
        vad_state_t res = vadnet->detect(model_data, buffer);
        if (res == VAD_SPEECH) {
            detected += 1;
        }

        chunks++;
    }
    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    // Wall-clock duration of the audio actually processed.
    int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency;
    float cpu_loading = tv_ms * 100.0 / run_ms;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
           tv_ms, run_ms, chunks, cpu_loading);

    free(buffer);  // fix: original leaked this buffer
    vadnet->destroy(model_data);
    esp_srmodel_deinit(models);
    TEST_ASSERT_EQUAL(true, (cpu_loading < 50 && detected > 35));
}
|
||||
@ -5,6 +5,7 @@ CONFIG_IDF_TARGET="esp32p4"
|
||||
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
|
||||
CONFIG_PARTITION_TABLE_CUSTOM=y
|
||||
CONFIG_SR_VADN_VADNET1_MEDIUM=y
|
||||
CONFIG_SR_WN_WN9_HILEXIN=y
|
||||
CONFIG_SPIRAM=y
|
||||
CONFIG_ESP_TASK_WDT_EN=n
|
||||
|
||||
@ -5,6 +5,7 @@ CONFIG_IDF_TARGET="esp32s3"
|
||||
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
|
||||
CONFIG_PARTITION_TABLE_CUSTOM=y
|
||||
CONFIG_SR_VADN_VADNET1_MEDIUM=y
|
||||
CONFIG_SR_WN_WN9_HILEXIN=y
|
||||
CONFIG_SPIRAM=y
|
||||
CONFIG_ESP_TASK_WDT_EN=n
|
||||
|
||||
Loading…
Reference in New Issue
Block a user