mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Merge branch 'feature/wakeNet6' into 'master'
Feature/wakenet6 See merge request speech-recognition-internal/esp_sr_public!8
This commit is contained in:
commit
b8c8606819
@ -1,5 +1,9 @@
|
||||
# Change log for esp-sr
|
||||
|
||||
## 0.3.0(dev)
|
||||
add wakenet6
|
||||
support cmake
|
||||
|
||||
## 0.2.0
|
||||
add acoustic algorithms, including AEC, AGC, VAD, NS
|
||||
add wakenet5X2 and wakenet5X3
|
||||
|
||||
@ -54,11 +54,15 @@ config SR_WN5X2_NIHAOXIAOZHI
|
||||
bool "nihaoxiaozhi (WakeNet5X2)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN5X3_NIHAOXIAOXIN
|
||||
bool "nihaoxiaoxin (WakeNet5X3)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN5_CUSTOMIZED_WORD
|
||||
bool "customized word (WakeNet5)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN6_HILEXIN
|
||||
config SR_WN6_NIHAOXIAOXIN
|
||||
bool "nihaoxiaoxin (WakeNet6)"
|
||||
depends on SR_MODEL_WN6_QUANT
|
||||
|
||||
@ -68,32 +72,57 @@ config SR_WN6_CUSTOMIZED_WORD
|
||||
|
||||
endchoice
|
||||
|
||||
choice SR_MN_MODEL_SEL
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_FLOAT
|
||||
help
|
||||
Select the model to be used.
|
||||
choice SR_RUN_WN6_CORE
|
||||
|
||||
config SR_MN1_MODEL_QUANT
|
||||
bool "MultiNet 1 (quantized)"
|
||||
depends on SR_MODEL_WN6_QUANT || SR_MODEL_WN6_FLOAT
|
||||
|
||||
prompt "ESP32 core to run WakeNet6"
|
||||
default SR_RUN_WM6_CORE1
|
||||
help
|
||||
Select one ESP32 core to run WakeNet6.
|
||||
|
||||
config SR_RUN_WN6_CORE0
|
||||
bool "core 0"
|
||||
|
||||
config SR_RUN_WN6_CORE1
|
||||
bool "core 1"
|
||||
|
||||
endchoice
|
||||
|
||||
choice SR_MN_MODEL_SEL
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_FLOAT
|
||||
help
|
||||
Select the model to be used.
|
||||
|
||||
config SR_MN1_MODEL_FLOAT
|
||||
bool "MultiNet 1 (float)"
|
||||
|
||||
config SR_MN1_MODEL_QUANT
|
||||
bool "MultiNet 1 (quantized)"
|
||||
|
||||
endchoice
|
||||
|
||||
|
||||
choice SR_LANGUAGE_SEL
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
|
||||
config SR_MN1_CHINESE_FLOAT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_FLOAT
|
||||
|
||||
config SR_MN1_CHINESE_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
|
||||
endchoice
|
||||
|
||||
config SPEECH_COMMANDS_NUM
|
||||
int "The number of speech commands"
|
||||
default 0
|
||||
default 20
|
||||
help
|
||||
The number of the speech commands.
|
||||
|
||||
@ -101,83 +130,83 @@ menu "Add speech commands"
|
||||
|
||||
config SPEECH_COMMAND_ID0
|
||||
string "ID0"
|
||||
default ""
|
||||
default "da kai kong tiao"
|
||||
|
||||
config SPEECH_COMMAND_ID1
|
||||
string "ID1"
|
||||
default ""
|
||||
default "guan bi kong tiao"
|
||||
|
||||
config SPEECH_COMMAND_ID2
|
||||
string "ID2"
|
||||
default ""
|
||||
default "zeng da feng su"
|
||||
|
||||
config SPEECH_COMMAND_ID3
|
||||
string "ID3"
|
||||
default ""
|
||||
default "jian xiao feng su"
|
||||
|
||||
config SPEECH_COMMAND_ID4
|
||||
string "ID4"
|
||||
default ""
|
||||
default "sheng gao yi du"
|
||||
|
||||
config SPEECH_COMMAND_ID5
|
||||
string "ID5"
|
||||
default ""
|
||||
default "jiang di yi du"
|
||||
|
||||
config SPEECH_COMMAND_ID6
|
||||
string "ID6"
|
||||
default ""
|
||||
default "zhi re mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID7
|
||||
string "ID7"
|
||||
default ""
|
||||
default "zhi leng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID8
|
||||
string "ID8"
|
||||
default ""
|
||||
default "song feng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID9
|
||||
string "ID9"
|
||||
default ""
|
||||
default "jie neng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID10
|
||||
string "ID10"
|
||||
default ""
|
||||
default "guan bi jie neng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID11
|
||||
string "ID11"
|
||||
default ""
|
||||
default "chu shi mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID12
|
||||
string "ID12"
|
||||
default ""
|
||||
default "guan bi chu shi mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID13
|
||||
string "ID13"
|
||||
default ""
|
||||
default "da kai lan ya"
|
||||
|
||||
config SPEECH_COMMAND_ID14
|
||||
string "ID14"
|
||||
default ""
|
||||
default "guan bi lan ya"
|
||||
|
||||
config SPEECH_COMMAND_ID15
|
||||
string "ID15"
|
||||
default ""
|
||||
default "bo fang ge qu"
|
||||
|
||||
config SPEECH_COMMAND_ID16
|
||||
string "ID16"
|
||||
default ""
|
||||
default "zan ting bo fang"
|
||||
|
||||
config SPEECH_COMMAND_ID17
|
||||
string "ID17"
|
||||
default ""
|
||||
default "ding shi yi xiao shi"
|
||||
|
||||
config SPEECH_COMMAND_ID18
|
||||
string "ID18"
|
||||
default ""
|
||||
default "da kai dian deng"
|
||||
|
||||
config SPEECH_COMMAND_ID19
|
||||
string "ID19"
|
||||
default ""
|
||||
default "guan bi dian deng"
|
||||
|
||||
config SPEECH_COMMAND_ID20
|
||||
string "ID20"
|
||||
|
||||
@ -16,6 +16,7 @@
|
||||
|
||||
|
||||
#include "dl_lib_matrixq.h"
|
||||
#include "dl_lib_conv_queue.h"
|
||||
|
||||
//fixed-point convolution FIFO queue.
|
||||
typedef struct {
|
||||
@ -135,6 +136,8 @@ fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
|
||||
*/
|
||||
qtp_t *dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift);
|
||||
qtp_t *dl_atrous_conv1dq_steps(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift, int offset);
|
||||
/**
|
||||
* @brief Fast implement of dilation layer as follows
|
||||
*
|
||||
@ -166,4 +169,6 @@ dl_matrix2dq_t *dl_basic_lstm_layer1_q(const dl_convq_queue_t *in, dl_matrix2dq_
|
||||
const dl_matrix2dq_t *weight, const dl_matrix2dq_t *bias, int step, int shift);
|
||||
void test_atrous_convq(int size, int rate, int in_channel, int out_channel);
|
||||
|
||||
dl_conv_queue_t *dl_convq_queue_add(dl_convq_queue_t *cq1, dl_convq_queue_t *cq2);
|
||||
|
||||
#endif
|
||||
Binary file not shown.
BIN
lib/libdl_lib.a
BIN
lib/libdl_lib.a
Binary file not shown.
Binary file not shown.
BIN
lib/libwakenet.a
BIN
lib/libwakenet.a
Binary file not shown.
99
main/Kconfig
99
main/Kconfig
@ -54,11 +54,15 @@ config SR_WN5X2_NIHAOXIAOZHI
|
||||
bool "nihaoxiaozhi (WakeNet5X2)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN5X3_NIHAOXIAOXIN
|
||||
bool "nihaoxiaoxin (WakeNet5X3)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN5_CUSTOMIZED_WORD
|
||||
bool "customized word (WakeNet5)"
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN6_HILEXIN
|
||||
config SR_WN6_NIHAOXIAOXIN
|
||||
bool "nihaoxiaoxin (WakeNet6)"
|
||||
depends on SR_MODEL_WN6_QUANT
|
||||
|
||||
@ -68,32 +72,57 @@ config SR_WN6_CUSTOMIZED_WORD
|
||||
|
||||
endchoice
|
||||
|
||||
choice SR_MN_MODEL_SEL
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_FLOAT
|
||||
help
|
||||
Select the model to be used.
|
||||
choice SR_RUN_WN6_CORE
|
||||
|
||||
config SR_MN1_MODEL_QUANT
|
||||
bool "MultiNet 1 (quantized)"
|
||||
depends on SR_MODEL_WN6_QUANT || SR_MODEL_WN6_FLOAT
|
||||
|
||||
prompt "ESP32 core to run WakeNet6"
|
||||
default SR_RUN_WM6_CORE1
|
||||
help
|
||||
Select one ESP32 core to run WakeNet6.
|
||||
|
||||
config SR_RUN_WN6_CORE0
|
||||
bool "core 0"
|
||||
|
||||
config SR_RUN_WN6_CORE1
|
||||
bool "core 1"
|
||||
|
||||
endchoice
|
||||
|
||||
choice SR_MN_MODEL_SEL
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_FLOAT
|
||||
help
|
||||
Select the model to be used.
|
||||
|
||||
config SR_MN1_MODEL_FLOAT
|
||||
bool "MultiNet 1 (float)"
|
||||
|
||||
config SR_MN1_MODEL_QUANT
|
||||
bool "MultiNet 1 (quantized)"
|
||||
|
||||
endchoice
|
||||
|
||||
|
||||
choice SR_LANGUAGE_SEL
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
|
||||
config SR_MN1_CHINESE_FLOAT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_FLOAT
|
||||
|
||||
config SR_MN1_CHINESE_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
|
||||
endchoice
|
||||
|
||||
config SPEECH_COMMANDS_NUM
|
||||
int "The number of speech commands"
|
||||
default 0
|
||||
default 20
|
||||
help
|
||||
The number of the speech commands.
|
||||
|
||||
@ -101,83 +130,83 @@ menu "Add speech commands"
|
||||
|
||||
config SPEECH_COMMAND_ID0
|
||||
string "ID0"
|
||||
default ""
|
||||
default "da kai kong tiao"
|
||||
|
||||
config SPEECH_COMMAND_ID1
|
||||
string "ID1"
|
||||
default ""
|
||||
default "guan bi kong tiao"
|
||||
|
||||
config SPEECH_COMMAND_ID2
|
||||
string "ID2"
|
||||
default ""
|
||||
default "zeng da feng su"
|
||||
|
||||
config SPEECH_COMMAND_ID3
|
||||
string "ID3"
|
||||
default ""
|
||||
default "jian xiao feng su"
|
||||
|
||||
config SPEECH_COMMAND_ID4
|
||||
string "ID4"
|
||||
default ""
|
||||
default "sheng gao yi du"
|
||||
|
||||
config SPEECH_COMMAND_ID5
|
||||
string "ID5"
|
||||
default ""
|
||||
default "jiang di yi du"
|
||||
|
||||
config SPEECH_COMMAND_ID6
|
||||
string "ID6"
|
||||
default ""
|
||||
default "zhi re mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID7
|
||||
string "ID7"
|
||||
default ""
|
||||
default "zhi leng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID8
|
||||
string "ID8"
|
||||
default ""
|
||||
default "song feng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID9
|
||||
string "ID9"
|
||||
default ""
|
||||
default "jie neng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID10
|
||||
string "ID10"
|
||||
default ""
|
||||
default "guan bi jie neng mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID11
|
||||
string "ID11"
|
||||
default ""
|
||||
default "chu shi mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID12
|
||||
string "ID12"
|
||||
default ""
|
||||
default "guan bi chu shi mo shi"
|
||||
|
||||
config SPEECH_COMMAND_ID13
|
||||
string "ID13"
|
||||
default ""
|
||||
default "da kai lan ya"
|
||||
|
||||
config SPEECH_COMMAND_ID14
|
||||
string "ID14"
|
||||
default ""
|
||||
default "guan bi lan ya"
|
||||
|
||||
config SPEECH_COMMAND_ID15
|
||||
string "ID15"
|
||||
default ""
|
||||
default "bo fang ge qu"
|
||||
|
||||
config SPEECH_COMMAND_ID16
|
||||
string "ID16"
|
||||
default ""
|
||||
default "zan ting bo fang"
|
||||
|
||||
config SPEECH_COMMAND_ID17
|
||||
string "ID17"
|
||||
default ""
|
||||
default "ding shi yi xiao shi"
|
||||
|
||||
config SPEECH_COMMAND_ID18
|
||||
string "ID18"
|
||||
default ""
|
||||
default "da kai dian deng"
|
||||
|
||||
config SPEECH_COMMAND_ID19
|
||||
string "ID19"
|
||||
default ""
|
||||
default "guan bi dian deng"
|
||||
|
||||
config SPEECH_COMMAND_ID20
|
||||
string "ID20"
|
||||
|
||||
@ -504,7 +504,6 @@ CONFIG_LWIP_MAX_RAW_PCBS=16
|
||||
#
|
||||
CONFIG_SR_MODEL_WN3_QUANT=
|
||||
CONFIG_SR_MODEL_WN4_QUANT=
|
||||
CONFIG_SR_MODEL_WN5_FLOAT=
|
||||
CONFIG_SR_MODEL_WN5_QUANT=y
|
||||
CONFIG_SR_MODEL_WN6_QUANT=
|
||||
CONFIG_SR_WN5_HILEXIN=
|
||||
@ -512,6 +511,7 @@ CONFIG_SR_WN5X2_HILEXIN=y
|
||||
CONFIG_SR_WN5X3_HILEXIN=
|
||||
CONFIG_SR_WN5_NIHAOXIAOZHI=
|
||||
CONFIG_SR_WN5X2_NIHAOXIAOZHI=
|
||||
CONFIG_SR_WN5X3_NIHAOXIAOXIN=
|
||||
CONFIG_SR_WN5_CUSTOMIZED_WORD=
|
||||
CONFIG_SR_MN1_MODEL_FLOAT=
|
||||
CONFIG_SR_MN1_MODEL_QUANT=y
|
||||
|
||||
@ -29,11 +29,11 @@ Please see the flow diagram of WakeNet below:
|
||||
|
||||
- How to select the WakeNet model
|
||||
|
||||
Go to `make menuconfig`, navigate to `Component config` >> `ESP Speech Recognition` >> `Wake word engine`. See below:
|
||||
|
||||
<center>
|
||||
<img src="../img/model_sel.png" width = "500" />
|
||||
</center>
|
||||
1. Go to `make menuconfig`, navigate to `Component config` >> `ESP Speech Recognition` >> `Wake word engine`. See below:
|
||||
<center> <img src="../img/model_sel.png" width = "500" /> </center>
|
||||
2. WakeNet6 is divided into two tasks: task1 calculates the speech features, and task2 calculates the neural network model. The ESP32 core used to run task2 can be selected via `Component config` >> `ESP Speech Recognition` >> `ESP32 core to run WakeNet6`.
|
||||
|
||||
|
||||
|
||||
|
||||
- How to select the wake words
|
||||
@ -74,6 +74,9 @@ Please see the flow diagram of WakeNet below:
|
||||
|Quantised WakeNet5|41 K|15 KB|5.5 ms|30 ms|
|
||||
|Quantised WakeNet5X2|165 K|20 KB|10.5 ms|30 ms|
|
||||
|Quantised WakeNet5X3|371 K|24 KB|18 ms|30 ms|
|
||||
|Quantised WakeNet6|378 K|45 KB|4ms(task1) + 25 ms(task2)|30 ms|
|
||||
|
||||
**Note**: Quantised WakeNet6 is split into two tasks: task1 calculates the speech features, and task2 calculates the neural network model.
|
||||
|
||||
### 2. Performance
|
||||
|
||||
|
||||
@ -19,7 +19,7 @@ WakeNet的流程图如下:
|
||||
- wakeNet3和wakeNet4基于[CRNN](https://arxiv.org/abs/1703.05390)结构。
|
||||
- WakeNet5(WakeNet5X2,WakeNet5X3) 和 WakeNet6 基于 [Dilated Convolution](https://arxiv.org/pdf/1609.03499.pdf) 结构。
|
||||
|
||||
注意,WakeNet5,WakeNet5X2 和 WakeNet5X3 的网络结构一致,但是 WakeNetX2 和 WakeNetX3 的参数比 WakeNet5 要多。请参考 [性能测试](#性能测试) 来获取更多细节。
|
||||
注意,WakeNet5,WakeNet5X2 和 WakeNet5X3 的网络结构一致,但是 WakeNet5X2 和 WakeNet5X3 的参数比 WakeNet5 要多。请参考 [性能测试](#性能测试) 来获取更多细节。
|
||||
|
||||
- keyword trigger method:
|
||||
对连续的音频流,为准确判断关键词的触发,我们通过计算若干帧内识别结果的平均值M,来判断触发。当M大于指定阈值时,发出触发的命令。
|
||||
@ -28,14 +28,16 @@ WakeNet的流程图如下:
|
||||
## API introduction
|
||||
|
||||
- WakeNet模型选择
|
||||
使用make menuconfig,选择Component config >> ESP Speech commands >> Keyword spotting model,如下图
|
||||
1. 使用make menuconfig,选择Component config >> ESP Speech Recognition >> Wake Word Engine,如下图
|
||||
|
||||
<center>
|
||||
<img src="../img/model_sel.png" width = "500" />
|
||||
</center>
|
||||
</center>
|
||||
|
||||
2. 不同于WakeNet5,WakeNet6被拆分成两个task:task1计算speech features,task2计算neural network model。task2使用的ESP32核心,可以通过Component config >> ESP Speech Recognition >> ESP32 core to run WakeNet6选择,默认使用core1。
|
||||
|
||||
- 唤醒词选择
|
||||
使用make menuconfig,选择Component config >> ESP Speech commands >> Wake word list进行选择,如下图
|
||||
使用make menuconfig,选择Component config >> ESP Speech Recognition >> Wake word list进行选择,如下图
|
||||
<center>
|
||||
<img src="../img/word_sel.png" width = "500" />
|
||||
</center>
|
||||
@ -70,7 +72,9 @@ WakeNet的流程图如下:
|
||||
|Quantised WakeNet5|41 K|15 KB|5.5 ms|30 ms|
|
||||
|Quantised WakeNet5X2|165 K|20 KB|10.5 ms|30 ms|
|
||||
|Quantised WakeNet5X3|371 K|24 KB|18 ms|30 ms|
|
||||
|Quantised WakeNet6|378 K|45 KB|4ms(task1)+25ms(task2)|30 ms|
|
||||
|
||||
**注**:Quantised WakeNet6被拆分成两个task,其中task1用于计算speech features,另一个task2用于计算神经网络。
|
||||
|
||||
### 2.识别性能
|
||||
|距离|安静环境|平稳噪声(SNR=0~10dB)|语音噪声(SNR=0~10dB)|AEC打断唤醒(-5~-15dB)|
|
||||
|
||||
@ -62,7 +62,11 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
#include "nihaoxiaozhi_wn5X2.h"
|
||||
#define WAKENET_COEFF get_coeff_nihaoxiaozhi_wn5X2
|
||||
|
||||
#elif CONFIG_SR_WN6_HILEXIN
|
||||
#elif CONFIG_SR_WN5X3_NIHAOXIAOXIN & CONFIG_SR_MODEL_WN5_QUANT
|
||||
#include "nihaoxiaoxin_wn5X3.h"
|
||||
#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn5X3
|
||||
|
||||
#elif CONFIG_SR_WN6_NIHAOXIAOXIN
|
||||
#include "nihaoxiaoxin_wn6.h"
|
||||
#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6
|
||||
|
||||
|
||||
8
wake_word_engine/include/nihaoxiaoxin_wn5X3.h
Normal file
8
wake_word_engine/include/nihaoxiaoxin_wn5X3.h
Normal file
@ -0,0 +1,8 @@
|
||||
//Generated by mkmodel
|
||||
#pragma once
|
||||
#include <string.h>
|
||||
#include "dl_lib_coefgetter_if.h"
|
||||
#include "dl_lib_matrix.h"
|
||||
#include "dl_lib_matrixq.h"
|
||||
|
||||
extern const model_coeff_getter_t get_coeff_nihaoxiaoxin_wn5X3;
|
||||
Binary file not shown.
BIN
wake_word_engine/libnihaoxiaoxin_wn5X3.a
Normal file
BIN
wake_word_engine/libnihaoxiaoxin_wn5X3.a
Normal file
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user