mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Update README.md
This commit is contained in:
parent
74e120a663
commit
ff9ffa5985
@ -1,9 +1,9 @@
|
||||
# esp_sr
|
||||
|
||||
Espressif esp_sr provides basic algorithms for **Speech Interaction** applications. Now, this framework has two models:
|
||||
Espressif esp_sr provides basic algorithms for **Speech Recognition** applications. Now, this framework has two models:
|
||||
|
||||
* The wake word detection model [WakeNet](wake_words_engine/README.md)
|
||||
* The speech commands recognition model [MultiNet](speech_commands_recognition/README.md)
|
||||
* The speech command recognition model [MultiNet](speech_commands_recognition/README.md)
|
||||
|
||||
These algorithms are provided in the form of a component, so they can be integrated into your projects with minimal effort.
|
||||
|
||||
@ -13,9 +13,9 @@ Espressif wake word engine [WakeNet](wake_words_engine/README.md) is specially d
|
||||
|
||||
Currently, Espressif has not only provided an official wake word "Hi, Lexin" to the public for free, but also allows customized wake words. For details on how to customize your own wake words, please see [Espressif Speech Wake Words Customization Process](wake_words_engine/ESP_Wake_Words_Customization.md).
|
||||
|
||||
## Speech Commands Recognition
|
||||
## Speech Command Recognition
|
||||
|
||||
Espressif's speech commands recognition model [MultiNet](speech_commands_recognition/README.md) is specially designed to provide a flexible off-line speech commands recognition model. With this model, you can easily add your own speech commands, eliminating the need to train model again.
|
||||
Espressif's speech command recognition model [MultiNet](speech_commands_recognition/README.md) is specially designed to provide a flexible off-line speech command recognition model. With this model, you can easily add your own speech commands, eliminating the need to train the model again.
|
||||
|
||||
Currently, Espressif **MultiNet** supports up to 100 Chinese speech commands, such as “打开空调” (Turn on the air conditioner) and “打开卧室灯” (Turn on the bedroom light).
|
||||
|
||||
|
||||
@ -80,6 +80,7 @@ qtp_t *dl_get_queue_itemq(dl_convq_queue_t *cq, int offset);
|
||||
* @return Pointer of the element
|
||||
*/
|
||||
qtp_t *dl_sigmoid_step_q(dl_convq_queue_t *cq, int offset);
|
||||
void dl_tanh_convq(dl_convq_queue_t *cq, int last_num);
|
||||
|
||||
/**
|
||||
* @brief Does a tanh operation on one of the elements in the convolution queue.
|
||||
@ -109,7 +110,7 @@ fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
|
||||
* based on convolution queue.
|
||||
*
|
||||
* @Warning All input and output convolution queue and matrix should be allocated. The return pointer
|
||||
* is first element of output queue and should not be freed separately.
|
||||
* is last element of output queue and should not be freed separately.
|
||||
*
|
||||
* @param in Input fixed-point convolution queue
|
||||
* @param out Output fixed-point convolution queue
|
||||
@ -121,8 +122,9 @@ fptp_t * dl_softmax_step_q(dl_convq_queue_t *cq, int offset, fptp_t *out);
|
||||
* @return The result of atrous convolution
|
||||
*/
|
||||
qtp_t *dl_atrous_conv1dq_step(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift);
|
||||
qtp_t *dl_atrous_conv1dq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* kernel, dl_matrix2dq_t* bias, int shift);
|
||||
|
||||
/**
|
||||
* @brief Fast implement of dilation layer as follows
|
||||
*
|
||||
@ -131,7 +133,7 @@ qtp_t *dl_atrous_conv1dq_step(dl_convq_queue_t *in, dl_convq_queue_t *out, int r
|
||||
* |-> [filter(tanh)] -|
|
||||
*
|
||||
* @Warning All input and output convolution queue and matrix should be allocated. The return pointer
|
||||
* is first element of output queue and should not be freed separately.
|
||||
* is last element of output queue and should not be freed separately.
|
||||
*
|
||||
* @param in Input fixed-point convolution queue
|
||||
* @param out Output fixed-point convolution queue
|
||||
@ -145,7 +147,11 @@ qtp_t *dl_atrous_conv1dq_step(dl_convq_queue_t *in, dl_convq_queue_t *out, int r
|
||||
* @gate_shift Shift ratio used in gate operation between two 16-bit fixed point vector
|
||||
* @return The result of dilation layer
|
||||
*/
|
||||
qtp_t *dl_dilation_layer_q(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
qtp_t *dl_dilation_layerq_step(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
|
||||
dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
|
||||
int filter_shift, int gate_shift);
|
||||
qtp_t *dl_dilation_layerq(dl_convq_queue_t *in, dl_convq_queue_t *out, int rate, int size,
|
||||
dl_matrix2dq_t* filter_kernel, dl_matrix2dq_t* filter_bias,
|
||||
dl_matrix2dq_t* gate_kernel, dl_matrix2dq_t* gate_bias,
|
||||
int filter_shift, int gate_shift);
|
||||
|
||||
Binary file not shown.
BIN
lib/libdl_lib.a
BIN
lib/libdl_lib.a
Binary file not shown.
Binary file not shown.
BIN
lib/libwakenet.a
BIN
lib/libwakenet.a
Binary file not shown.
435
main/Kconfig
435
main/Kconfig
@ -4,7 +4,7 @@ choice SR_MODEL_SEL
|
||||
prompt "Wake word engine"
|
||||
default SR_MODEL_WN5_QUANT
|
||||
help
|
||||
Select the keyword spotting model to be used.
|
||||
Select the Wake Word Engine to be used.
|
||||
|
||||
config SR_MODEL_WN3_QUANT
|
||||
bool "WakeNet 3 (quantized)"
|
||||
@ -43,7 +43,7 @@ config SR_WN5_CUSTOMIZED_WORD
|
||||
depends on SR_MODEL_WN5_QUANT || SR_MODEL_WN5_FLOAT
|
||||
|
||||
config SR_WN6_HILEXIN
|
||||
bool "hilexin (WakeNet6)"
|
||||
bool "nihaoxiaoxin (WakeNet6)"
|
||||
depends on SR_MODEL_WN6_QUANT
|
||||
|
||||
config SR_WN6_CUSTOMIZED_WORD
|
||||
@ -53,27 +53,436 @@ config SR_WN6_CUSTOMIZED_WORD
|
||||
endchoice
|
||||
|
||||
choice SR_MN_MODEL_SEL
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_QUANT
|
||||
help
|
||||
Select the model to be used.
|
||||
prompt "speech commands recognition model to use"
|
||||
default CONFIG_MN1_MODEL_FLOAT
|
||||
help
|
||||
Select the model to be used.
|
||||
|
||||
config SR_MN1_MODEL_QUANT
|
||||
bool "MultiNet 1 (quantized)"
|
||||
bool "MultiNet 1 (quantized)"
|
||||
|
||||
endchoice
|
||||
|
||||
choice SR_LANGUAGE_SEL
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
prompt "langugae"
|
||||
default SR_MN1_CHINESE
|
||||
help
|
||||
Select the language to be used.
|
||||
|
||||
config SR_MN1_CHINESE_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
bool "chinese (MultiNet1)"
|
||||
depends on SR_MN1_MODEL_QUANT
|
||||
|
||||
endchoice
|
||||
|
||||
config SPEECH_COMMANDS_NUM
|
||||
int "The number of speech commands"
|
||||
default 0
|
||||
help
|
||||
The number of the speech commands.
|
||||
|
||||
menu "Add speech commands"
|
||||
|
||||
config SPEECH_COMMAND_ID0
|
||||
string "ID0"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID1
|
||||
string "ID1"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID2
|
||||
string "ID2"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID3
|
||||
string "ID3"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID4
|
||||
string "ID4"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID5
|
||||
string "ID5"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID6
|
||||
string "ID6"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID7
|
||||
string "ID7"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID8
|
||||
string "ID8"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID9
|
||||
string "ID9"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID10
|
||||
string "ID10"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID11
|
||||
string "ID11"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID12
|
||||
string "ID12"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID13
|
||||
string "ID13"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID14
|
||||
string "ID14"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID15
|
||||
string "ID15"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID16
|
||||
string "ID16"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID17
|
||||
string "ID17"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID18
|
||||
string "ID18"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID19
|
||||
string "ID19"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID20
|
||||
string "ID20"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID21
|
||||
string "ID21"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID22
|
||||
string "ID22"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID23
|
||||
string "ID23"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID24
|
||||
string "ID24"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID25
|
||||
string "ID25"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID26
|
||||
string "ID26"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID27
|
||||
string "ID27"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID28
|
||||
string "ID28"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID29
|
||||
string "ID29"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID30
|
||||
string "ID30"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID31
|
||||
string "ID31"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID32
|
||||
string "ID32"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID33
|
||||
string "ID33"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID34
|
||||
string "ID34"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID35
|
||||
string "ID35"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID36
|
||||
string "ID36"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID37
|
||||
string "ID37"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID38
|
||||
string "ID38"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID39
|
||||
string "ID39"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID40
|
||||
string "ID40"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID41
|
||||
string "ID41"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID42
|
||||
string "ID42"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID43
|
||||
string "ID43"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID44
|
||||
string "ID44"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID45
|
||||
string "ID45"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID46
|
||||
string "ID46"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID47
|
||||
string "ID47"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID48
|
||||
string "ID48"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID49
|
||||
string "ID49"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID50
|
||||
string "ID50"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID51
|
||||
string "ID51"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID52
|
||||
string "ID52"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID53
|
||||
string "ID53"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID54
|
||||
string "ID54"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID55
|
||||
string "ID55"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID56
|
||||
string "ID56"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID57
|
||||
string "ID57"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID58
|
||||
string "ID58"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID59
|
||||
string "ID59"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID60
|
||||
string "ID60"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID61
|
||||
string "ID61"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID62
|
||||
string "ID62"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID63
|
||||
string "ID63"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID64
|
||||
string "ID64"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID65
|
||||
string "ID65"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID66
|
||||
string "ID66"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID67
|
||||
string "ID67"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID68
|
||||
string "ID68"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID69
|
||||
string "ID69"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID70
|
||||
string "ID70"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID71
|
||||
string "ID71"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID72
|
||||
string "ID72"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID73
|
||||
string "ID73"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID74
|
||||
string "ID74"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID75
|
||||
string "ID75"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID76
|
||||
string "ID76"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID77
|
||||
string "ID77"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID78
|
||||
string "ID78"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID79
|
||||
string "ID79"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID80
|
||||
string "ID80"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID81
|
||||
string "ID81"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID82
|
||||
string "ID82"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID83
|
||||
string "ID83"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID84
|
||||
string "ID84"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID85
|
||||
string "ID85"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID86
|
||||
string "ID86"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID87
|
||||
string "ID87"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID88
|
||||
string "ID88"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID89
|
||||
string "ID89"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID90
|
||||
string "ID90"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID91
|
||||
string "ID91"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID92
|
||||
string "ID92"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID93
|
||||
string "ID93"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID94
|
||||
string "ID94"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID95
|
||||
string "ID95"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID96
|
||||
string "ID96"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID97
|
||||
string "ID97"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID98
|
||||
string "ID98"
|
||||
default ""
|
||||
|
||||
config SPEECH_COMMAND_ID99
|
||||
string "ID99"
|
||||
default ""
|
||||
|
||||
endmenu
|
||||
|
||||
endmenu
|
||||
|
||||
@ -8,12 +8,13 @@
|
||||
|
||||
#include "wakenet_test.h"
|
||||
#include "multinet_test.h"
|
||||
|
||||
void app_main()
|
||||
{
|
||||
// test wakenet
|
||||
wakenet_test();
|
||||
vTaskDelay(3000 / portTICK_PERIOD_MS);
|
||||
|
||||
// //test multinet
|
||||
//test multinet
|
||||
multinet_test();
|
||||
}
|
||||
|
||||
@ -16,6 +16,7 @@ void multinetTask(void *arg)
|
||||
model_iface_data_t *model_data = arg;
|
||||
int frequency = multinet->get_samp_rate(model_data);
|
||||
int audio_chunksize = multinet->get_samp_chunksize(model_data);
|
||||
int chunk_num = multinet->get_samp_chunknum(model_data);
|
||||
int16_t *buffer = malloc(audio_chunksize * sizeof(int16_t));
|
||||
assert(buffer);
|
||||
int chunks = 0;
|
||||
@ -25,11 +26,11 @@ void multinetTask(void *arg)
|
||||
} else {
|
||||
memset(buffer, 0, audio_chunksize*sizeof(int16_t));
|
||||
}
|
||||
int commend_id = multinet->detect(model_data, buffer);
|
||||
int command_id = multinet->detect(model_data, buffer);
|
||||
chunks++;
|
||||
if (chunks == 200 || commend_id > -1) {
|
||||
if (commend_id > -1) {
|
||||
printf("MN test successfully, Commands ID: %d.\n", commend_id);
|
||||
if (chunks == chunk_num || command_id > -1) {
|
||||
if (command_id > -1) {
|
||||
printf("MN test successfully, Commands ID: %d.\n", command_id);
|
||||
} else {
|
||||
printf("can not recognize any speech commands\n");
|
||||
}
|
||||
@ -40,50 +41,17 @@ void multinetTask(void *arg)
|
||||
vTaskDelete(NULL);
|
||||
}
|
||||
|
||||
void add_speech_commands(esp_mn_iface_t *multinet, model_iface_data_t *model_data)
|
||||
{
|
||||
char *phrase_spelling[20];
|
||||
phrase_spelling[0] = "d,a,k,ai,k,ong,ti,ao";
|
||||
phrase_spelling[1] = "gu,an,b,i,k,ong,ti,ao";
|
||||
phrase_spelling[2] = "z,eng,d,a,f,eng,s,u";
|
||||
phrase_spelling[3] = "ji,an,xi,ao,f,eng,s,u";
|
||||
phrase_spelling[4] = "sh,eng,g,ao,y,i,d,u";
|
||||
phrase_spelling[5] = "ji,ang,d,i,y,i,d,u";
|
||||
phrase_spelling[6] = "zh,i,r,e,m,o,sh,i";
|
||||
phrase_spelling[7] = "zh,i,l,eng,m,o,sh,i";
|
||||
phrase_spelling[8] = "s,ong,f,eng,m,o,sh,i";
|
||||
phrase_spelling[9] = "j,ie,n,eng,m,o,sh,i";
|
||||
|
||||
phrase_spelling[10] = "gu,an,b,i,j,ie,n,eng,m,o,sh,i";
|
||||
phrase_spelling[11] = "ch,u,sh,i,m,o,sh,i";
|
||||
phrase_spelling[12] = "gu,an,b,i,ch,u,sh,i";
|
||||
phrase_spelling[13] = "d,a,k,ai,l,an,y,a";
|
||||
phrase_spelling[14] = "gu,an,b,i,l,an,y,a";
|
||||
phrase_spelling[15] = "b,o,f,ang,g,e,q,u";
|
||||
phrase_spelling[16] = "z,an,t,ing,b,o,f,ang";
|
||||
phrase_spelling[17] = "d,ing,sh,i,y,i,xi,ao,sh,i";
|
||||
phrase_spelling[18] = "d,a,k,ai,di,an,d,eng";
|
||||
phrase_spelling[19] = "gu,an,b,i,di,an,d,eng";
|
||||
|
||||
printf("start to add commands:\n");
|
||||
for (int i = 0; i < 20; i++) {
|
||||
printf("commend %d: %s\n",i+1, phrase_spelling[i]);
|
||||
multinet->add_speech_commands(model_data, i, phrase_spelling[i], NULL);
|
||||
}
|
||||
}
|
||||
|
||||
void multinet_test()
|
||||
{
|
||||
int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
|
||||
printf("Start free RAM size: %d\n", start_size);
|
||||
|
||||
//Initialize multinet model
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
|
||||
add_speech_commands(multinet, model_data);
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF, 6000);
|
||||
|
||||
//define_speech_commands(multinet, model_data);
|
||||
int audio_chunksize = multinet->get_samp_chunksize(model_data);
|
||||
printf("multinet RAM size: %d\n, current RAM size after multinet init: %d\n",
|
||||
printf("multinet RAM size: %d\nRAM size after multinet init: %d\n",
|
||||
start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT), heap_caps_get_free_size(MALLOC_CAP_8BIT));
|
||||
|
||||
xTaskCreatePinnedToCore(&multinetTask, "multinet", 2 * 1024, (void*)model_data, 5, NULL, 0);
|
||||
|
||||
@ -8,6 +8,7 @@
|
||||
#include "dl_lib_coefgetter_if.h"
|
||||
#include "wakenet_test.h"
|
||||
#include "hilexin.h"
|
||||
#include <sys/time.h>
|
||||
|
||||
static const esp_wn_iface_t *wakenet = &WAKENET_MODEL;
|
||||
static const model_coeff_getter_t *model_coeff_getter = &WAKENET_COEFF;
|
||||
@ -21,6 +22,8 @@ void wakenetTask(void *arg)
|
||||
assert(buffer);
|
||||
|
||||
int chunks = 0;
|
||||
struct timeval tv_start, tv_end;
|
||||
gettimeofday(&tv_start, NULL);
|
||||
while (1) {
|
||||
if ((chunks + 1)*audio_chunksize * sizeof(int16_t) <= sizeof(hilexin)) {
|
||||
memcpy(buffer, hilexin + chunks * audio_chunksize * sizeof(int16_t), audio_chunksize * sizeof(int16_t));
|
||||
@ -34,6 +37,10 @@ void wakenetTask(void *arg)
|
||||
}
|
||||
chunks++;
|
||||
}
|
||||
gettimeofday(&tv_end, NULL);
|
||||
int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
|
||||
printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
|
||||
tv_ms, chunks*30, chunks, tv_ms*1.0/chunks/3*10);
|
||||
printf("TEST FINISHED\n\n");
|
||||
vTaskDelete(NULL);
|
||||
}
|
||||
|
||||
@ -20,6 +20,7 @@ CONFIG_LOG_BOOTLOADER_LEVEL_INFO=y
|
||||
CONFIG_LOG_BOOTLOADER_LEVEL_DEBUG=
|
||||
CONFIG_LOG_BOOTLOADER_LEVEL_VERBOSE=
|
||||
CONFIG_LOG_BOOTLOADER_LEVEL=3
|
||||
CONFIG_BOOTLOADER_SPI_WP_PIN=7
|
||||
CONFIG_BOOTLOADER_VDDSDIO_BOOST_1_9V=y
|
||||
CONFIG_BOOTLOADER_FACTORY_RESET=
|
||||
CONFIG_BOOTLOADER_APP_TEST=
|
||||
@ -38,17 +39,17 @@ CONFIG_FLASH_ENCRYPTION_ENABLED=
|
||||
# Serial flasher config
|
||||
#
|
||||
CONFIG_ESPTOOLPY_PORT="/dev/ttyUSB0"
|
||||
CONFIG_ESPTOOLPY_BAUD_115200B=y
|
||||
CONFIG_ESPTOOLPY_BAUD_115200B=
|
||||
CONFIG_ESPTOOLPY_BAUD_230400B=
|
||||
CONFIG_ESPTOOLPY_BAUD_921600B=
|
||||
CONFIG_ESPTOOLPY_BAUD_921600B=y
|
||||
CONFIG_ESPTOOLPY_BAUD_2MB=
|
||||
CONFIG_ESPTOOLPY_BAUD_OTHER=
|
||||
CONFIG_ESPTOOLPY_BAUD_OTHER_VAL=115200
|
||||
CONFIG_ESPTOOLPY_BAUD=115200
|
||||
CONFIG_ESPTOOLPY_BAUD=921600
|
||||
CONFIG_ESPTOOLPY_COMPRESSED=y
|
||||
CONFIG_FLASHMODE_QIO=
|
||||
CONFIG_FLASHMODE_QIO=y
|
||||
CONFIG_FLASHMODE_QOUT=
|
||||
CONFIG_FLASHMODE_DIO=y
|
||||
CONFIG_FLASHMODE_DIO=
|
||||
CONFIG_FLASHMODE_DOUT=
|
||||
CONFIG_ESPTOOLPY_FLASHMODE="dio"
|
||||
CONFIG_ESPTOOLPY_FLASHFREQ_80M=y
|
||||
@ -58,10 +59,10 @@ CONFIG_ESPTOOLPY_FLASHFREQ_20M=
|
||||
CONFIG_ESPTOOLPY_FLASHFREQ="80m"
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_1MB=
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_2MB=
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_4MB=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_8MB=
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_4MB=
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE="4MB"
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE="8MB"
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_DETECT=y
|
||||
CONFIG_ESPTOOLPY_BEFORE_RESET=y
|
||||
CONFIG_ESPTOOLPY_BEFORE_NORESET=
|
||||
@ -161,7 +162,7 @@ CONFIG_SPIRAM_SUPPORT=y
|
||||
# SPI RAM config
|
||||
#
|
||||
CONFIG_SPIRAM_BOOT_INIT=y
|
||||
CONFIG_SPIRAM_IGNORE_NOTFOUND=
|
||||
CONFIG_SPIRAM_IGNORE_NOTFOUND=y
|
||||
CONFIG_SPIRAM_USE_MEMMAP=
|
||||
CONFIG_SPIRAM_USE_CAPS_ALLOC=
|
||||
CONFIG_SPIRAM_USE_MALLOC=y
|
||||
@ -195,7 +196,7 @@ CONFIG_TWO_UNIVERSAL_MAC_ADDRESS=
|
||||
CONFIG_FOUR_UNIVERSAL_MAC_ADDRESS=y
|
||||
CONFIG_NUMBER_OF_UNIVERSAL_MAC_ADDRESS=4
|
||||
CONFIG_SYSTEM_EVENT_QUEUE_SIZE=32
|
||||
CONFIG_SYSTEM_EVENT_TASK_STACK_SIZE=2304
|
||||
CONFIG_SYSTEM_EVENT_TASK_STACK_SIZE=2048
|
||||
CONFIG_MAIN_TASK_STACK_SIZE=3584
|
||||
CONFIG_IPC_TASK_STACK_SIZE=1024
|
||||
CONFIG_TIMER_TASK_STACK_SIZE=3584
|
||||
@ -311,7 +312,7 @@ CONFIG_HTTPD_MAX_URI_LEN=512
|
||||
#
|
||||
CONFIG_DMA_RX_BUF_NUM=10
|
||||
CONFIG_DMA_TX_BUF_NUM=10
|
||||
CONFIG_EMAC_L2_TO_L3_RX_BUF_MODE=y
|
||||
CONFIG_EMAC_L2_TO_L3_RX_BUF_MODE=
|
||||
CONFIG_EMAC_CHECK_LINK_PERIOD_MS=2000
|
||||
CONFIG_EMAC_TASK_PRIORITY=20
|
||||
CONFIG_EMAC_TASK_STACK_SIZE=3072
|
||||
@ -383,7 +384,7 @@ CONFIG_FREERTOS_THREAD_LOCAL_STORAGE_POINTERS=1
|
||||
CONFIG_FREERTOS_ASSERT_FAIL_ABORT=y
|
||||
CONFIG_FREERTOS_ASSERT_FAIL_PRINT_CONTINUE=
|
||||
CONFIG_FREERTOS_ASSERT_DISABLE=
|
||||
CONFIG_FREERTOS_IDLE_TASK_STACKSIZE=1536
|
||||
CONFIG_FREERTOS_IDLE_TASK_STACKSIZE=1024
|
||||
CONFIG_FREERTOS_ISR_STACKSIZE=1536
|
||||
CONFIG_FREERTOS_LEGACY_HOOKS=
|
||||
CONFIG_FREERTOS_MAX_TASK_NAME_LEN=16
|
||||
@ -393,8 +394,12 @@ CONFIG_TIMER_TASK_PRIORITY=1
|
||||
CONFIG_TIMER_TASK_STACK_DEPTH=2048
|
||||
CONFIG_TIMER_QUEUE_LENGTH=10
|
||||
CONFIG_FREERTOS_QUEUE_REGISTRY_SIZE=0
|
||||
CONFIG_FREERTOS_USE_TRACE_FACILITY=
|
||||
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=
|
||||
CONFIG_FREERTOS_USE_TRACE_FACILITY=y
|
||||
CONFIG_FREERTOS_USE_STATS_FORMATTING_FUNCTIONS=y
|
||||
CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID=
|
||||
CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS=y
|
||||
CONFIG_FREERTOS_RUN_TIME_STATS_USING_ESP_TIMER=y
|
||||
CONFIG_FREERTOS_RUN_TIME_STATS_USING_CPU_CLK=
|
||||
CONFIG_FREERTOS_DEBUG_INTERNALS=
|
||||
CONFIG_FREERTOS_TASK_FUNCTION_WRAPPER=y
|
||||
|
||||
@ -437,7 +442,7 @@ CONFIG_LWIP_DHCP_MAX_NTP_SERVERS=1
|
||||
CONFIG_LWIP_IP_FRAG=
|
||||
CONFIG_LWIP_IP_REASSEMBLY=
|
||||
CONFIG_LWIP_STATS=
|
||||
CONFIG_LWIP_ETHARP_TRUST_IP_MAC=
|
||||
CONFIG_LWIP_ETHARP_TRUST_IP_MAC=y
|
||||
CONFIG_ESP_GRATUITOUS_ARP=y
|
||||
CONFIG_GARP_TMR_INTERVAL=60
|
||||
CONFIG_TCPIP_RECVMBOX_SIZE=32
|
||||
@ -476,7 +481,7 @@ CONFIG_TCP_OVERSIZE_DISABLE=
|
||||
#
|
||||
CONFIG_LWIP_MAX_UDP_PCBS=16
|
||||
CONFIG_UDP_RECVMBOX_SIZE=6
|
||||
CONFIG_TCPIP_TASK_STACK_SIZE=3072
|
||||
CONFIG_TCPIP_TASK_STACK_SIZE=2048
|
||||
CONFIG_TCPIP_TASK_AFFINITY_NO_AFFINITY=y
|
||||
CONFIG_TCPIP_TASK_AFFINITY_CPU0=
|
||||
CONFIG_TCPIP_TASK_AFFINITY_CPU1=
|
||||
@ -499,12 +504,119 @@ CONFIG_LWIP_MAX_RAW_PCBS=16
|
||||
#
|
||||
CONFIG_SR_MODEL_WN3_QUANT=
|
||||
CONFIG_SR_MODEL_WN4_QUANT=
|
||||
CONFIG_SR_MODEL_WN5_QUANT=
|
||||
CONFIG_SR_MODEL_WN6_QUANT=y
|
||||
CONFIG_SR_WN6_HILEXIN=y
|
||||
CONFIG_SR_WN6_CUSTOMIZED_WORD=
|
||||
CONFIG_SR_MODEL_WN5_FLOAT=
|
||||
CONFIG_SR_MODEL_WN5_QUANT=y
|
||||
CONFIG_SR_MODEL_WN6_QUANT=
|
||||
CONFIG_SR_WN5_HILEXIN=y
|
||||
CONFIG_SR_WN5_CUSTOMIZED_WORD=
|
||||
CONFIG_SR_MN1_MODEL_FLOAT=
|
||||
CONFIG_SR_MN1_MODEL_QUANT=y
|
||||
CONFIG_SR_MN1_CHINESE_QUANT=y
|
||||
CONFIG_SPEECH_COMMANDS_NUM=20
|
||||
|
||||
#
|
||||
# Add speech commands
|
||||
#
|
||||
CONFIG_SPEECH_COMMAND_ID0="da kai kong tiao"
|
||||
CONFIG_SPEECH_COMMAND_ID1="guan bi kong tiao"
|
||||
CONFIG_SPEECH_COMMAND_ID2="zeng da feng su"
|
||||
CONFIG_SPEECH_COMMAND_ID3="jian xiao feng su"
|
||||
CONFIG_SPEECH_COMMAND_ID4="sheng gao yi du"
|
||||
CONFIG_SPEECH_COMMAND_ID5="jiang di yi du"
|
||||
CONFIG_SPEECH_COMMAND_ID6="zhi re mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID7="zhi leng mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID8="song feng mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID9="jie neng mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID10="guan bi jie neng mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID11="chu shi mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID12="guan bi chu shi mo shi"
|
||||
CONFIG_SPEECH_COMMAND_ID13="da kai lan ya"
|
||||
CONFIG_SPEECH_COMMAND_ID14="guan bi lan ya"
|
||||
CONFIG_SPEECH_COMMAND_ID15="bo fang ge qu"
|
||||
CONFIG_SPEECH_COMMAND_ID16="zan ting bo fang"
|
||||
CONFIG_SPEECH_COMMAND_ID17="ding shi yi xiao shi"
|
||||
CONFIG_SPEECH_COMMAND_ID18="da kai dian deng"
|
||||
CONFIG_SPEECH_COMMAND_ID19="guan bi dian deng"
|
||||
CONFIG_SPEECH_COMMAND_ID20=""
|
||||
CONFIG_SPEECH_COMMAND_ID21=""
|
||||
CONFIG_SPEECH_COMMAND_ID22=""
|
||||
CONFIG_SPEECH_COMMAND_ID23=""
|
||||
CONFIG_SPEECH_COMMAND_ID24=""
|
||||
CONFIG_SPEECH_COMMAND_ID25=""
|
||||
CONFIG_SPEECH_COMMAND_ID26=""
|
||||
CONFIG_SPEECH_COMMAND_ID27=""
|
||||
CONFIG_SPEECH_COMMAND_ID28=""
|
||||
CONFIG_SPEECH_COMMAND_ID29=""
|
||||
CONFIG_SPEECH_COMMAND_ID30=""
|
||||
CONFIG_SPEECH_COMMAND_ID31=""
|
||||
CONFIG_SPEECH_COMMAND_ID32=""
|
||||
CONFIG_SPEECH_COMMAND_ID33=""
|
||||
CONFIG_SPEECH_COMMAND_ID34=""
|
||||
CONFIG_SPEECH_COMMAND_ID35=""
|
||||
CONFIG_SPEECH_COMMAND_ID36=""
|
||||
CONFIG_SPEECH_COMMAND_ID37=""
|
||||
CONFIG_SPEECH_COMMAND_ID38=""
|
||||
CONFIG_SPEECH_COMMAND_ID39=""
|
||||
CONFIG_SPEECH_COMMAND_ID40=""
|
||||
CONFIG_SPEECH_COMMAND_ID41=""
|
||||
CONFIG_SPEECH_COMMAND_ID42=""
|
||||
CONFIG_SPEECH_COMMAND_ID43=""
|
||||
CONFIG_SPEECH_COMMAND_ID44=""
|
||||
CONFIG_SPEECH_COMMAND_ID45=""
|
||||
CONFIG_SPEECH_COMMAND_ID46=""
|
||||
CONFIG_SPEECH_COMMAND_ID47=""
|
||||
CONFIG_SPEECH_COMMAND_ID48=""
|
||||
CONFIG_SPEECH_COMMAND_ID49=""
|
||||
CONFIG_SPEECH_COMMAND_ID50=""
|
||||
CONFIG_SPEECH_COMMAND_ID51=""
|
||||
CONFIG_SPEECH_COMMAND_ID52=""
|
||||
CONFIG_SPEECH_COMMAND_ID53=""
|
||||
CONFIG_SPEECH_COMMAND_ID54=""
|
||||
CONFIG_SPEECH_COMMAND_ID55=""
|
||||
CONFIG_SPEECH_COMMAND_ID56=""
|
||||
CONFIG_SPEECH_COMMAND_ID57=""
|
||||
CONFIG_SPEECH_COMMAND_ID58=""
|
||||
CONFIG_SPEECH_COMMAND_ID59=""
|
||||
CONFIG_SPEECH_COMMAND_ID60=""
|
||||
CONFIG_SPEECH_COMMAND_ID61=""
|
||||
CONFIG_SPEECH_COMMAND_ID62=""
|
||||
CONFIG_SPEECH_COMMAND_ID63=""
|
||||
CONFIG_SPEECH_COMMAND_ID64=""
|
||||
CONFIG_SPEECH_COMMAND_ID65=""
|
||||
CONFIG_SPEECH_COMMAND_ID66=""
|
||||
CONFIG_SPEECH_COMMAND_ID67=""
|
||||
CONFIG_SPEECH_COMMAND_ID68=""
|
||||
CONFIG_SPEECH_COMMAND_ID69=""
|
||||
CONFIG_SPEECH_COMMAND_ID70=""
|
||||
CONFIG_SPEECH_COMMAND_ID71=""
|
||||
CONFIG_SPEECH_COMMAND_ID72=""
|
||||
CONFIG_SPEECH_COMMAND_ID73=""
|
||||
CONFIG_SPEECH_COMMAND_ID74=""
|
||||
CONFIG_SPEECH_COMMAND_ID75=""
|
||||
CONFIG_SPEECH_COMMAND_ID76=""
|
||||
CONFIG_SPEECH_COMMAND_ID77=""
|
||||
CONFIG_SPEECH_COMMAND_ID78=""
|
||||
CONFIG_SPEECH_COMMAND_ID79=""
|
||||
CONFIG_SPEECH_COMMAND_ID80=""
|
||||
CONFIG_SPEECH_COMMAND_ID81=""
|
||||
CONFIG_SPEECH_COMMAND_ID82=""
|
||||
CONFIG_SPEECH_COMMAND_ID83=""
|
||||
CONFIG_SPEECH_COMMAND_ID84=""
|
||||
CONFIG_SPEECH_COMMAND_ID85=""
|
||||
CONFIG_SPEECH_COMMAND_ID86=""
|
||||
CONFIG_SPEECH_COMMAND_ID87=""
|
||||
CONFIG_SPEECH_COMMAND_ID88=""
|
||||
CONFIG_SPEECH_COMMAND_ID89=""
|
||||
CONFIG_SPEECH_COMMAND_ID90=""
|
||||
CONFIG_SPEECH_COMMAND_ID91=""
|
||||
CONFIG_SPEECH_COMMAND_ID92=""
|
||||
CONFIG_SPEECH_COMMAND_ID93=""
|
||||
CONFIG_SPEECH_COMMAND_ID94=""
|
||||
CONFIG_SPEECH_COMMAND_ID95=""
|
||||
CONFIG_SPEECH_COMMAND_ID96=""
|
||||
CONFIG_SPEECH_COMMAND_ID97=""
|
||||
CONFIG_SPEECH_COMMAND_ID98=""
|
||||
CONFIG_SPEECH_COMMAND_ID99=""
|
||||
|
||||
#
|
||||
# mbedTLS
|
||||
@ -4,7 +4,7 @@ MultiNet is a lightweight model specially designed based on [CRNN](https://arxiv
|
||||
|
||||
## Overview
|
||||
|
||||
MultiNet uses the **MFCC features** of an audio clip as input, and the **phonemes** (Chinese or English) as output. By comparing the output phonemes, the relevant Chinese or English word is identified.
|
||||
MultiNet uses the **MFCC features** of an audio clip as input, and the **phonemes** (Chinese or English) as output. By comparing the output phonemes, the relevant Chinese or English command is identified.
|
||||
|
||||
## Commands Recognition Process
|
||||
|
||||
@ -13,7 +13,7 @@ MultiNet uses the **MFCC features** of an audio clip as input, and the **phoneme
|
||||
3. Input this audio to the MFCC model and get its **MFCC features**.
|
||||
4. Input the obtained **MFCC features** to MultiNet and get the output **phoneme**.
|
||||
5. Input the obtained **phoneme** to the Language model and get the output.
|
||||
6. Compare the output against the existing command words one by one, and output the Command ID of the matching command (if any).
|
||||
6. Compare the output against the existing speech commands one by one, and output the Command ID of the matching command (if any).
|
||||
|
||||
Please see the flow diagram below:
|
||||
|
||||
@ -24,7 +24,7 @@ Please see the flow diagram below:
|
||||
|
||||
### User-defined Command
|
||||
|
||||
Currently, users can define their own command words in the code. You can refer to the method of adding command words in `multinet_test.c`, there are already 20 commands pre-stored in `multinet_test.c`.
|
||||
Currently, users can define their own speech commands in `menuconfig`. To add speech commands, go to menuconfig > Component config > ESP Speech Recognition > Add speech commands. There are already 20 commands pre-stored in sdkconfig.
|
||||
|
||||
|Command ID|Command|Command ID|Command|Command ID|Command|Command ID|Command|
|
||||
|:---:|:---:|:---:|:---:|:---:|:---:|:---:|:---:|
|
||||
@ -32,24 +32,33 @@ Currently, users can define their own command words in the code. You can refer t
|
||||
|1|关闭空调 (Turn off the air conditioner)|6|制热模式 (Heating mode)|11| 除湿模式 (Dehumidifying mode)|16| 暂停播放 (Pause playing)
|
||||
|2|增大风速 (Give me more wind)|7|制冷模式 (Cooling mode)|12| 关闭除湿模式 (Disable dehumidifying mode)|17| 定时一小时 (Set timer to 1 hour)
|
||||
|3|减少风速 (Give me less wind)|8|送风模式 (Ventilating mode)|13| 打开蓝牙 (Enable the Bluetooth)|18| 打开电灯 (Turn on the light)
|
||||
|4| 升高一度 (Increase by one degree)|9|节能模式 (Power-saving mode)|10| 关闭节能模式 (Disable power-saving mode)|19| 关闭电灯 (Turn off the light)
|
||||
|
||||
|4| 升高一度 (Increase by one degree)|9|节能模式 (Power-saving mode)|14| 关闭蓝牙 (Disable the Bluetooth)|19| 关闭电灯 (Turn off the light)
|
||||
|
||||
MultiNet supports user-defined commands. You can add your own commands to MultiNet. Note that the newly added command should obtain its command ID before it can be recognized by MultiNet.
|
||||
|
||||
### Add Speech Command
|
||||
|
||||
Users can define their own speech commands in the `menuconfig` in Pinyin, for example:
|
||||
|
||||
For the command “打开空调” (turn on the air conditioner), enter "da kai kong tiao" in the blank.
|
||||
|
||||
- One speech command ID can correspond to multiple speech command phrases;
|
||||
- Up to 100 speech command IDs or speech command phrases, including customized commands, are supported;
|
||||
- Multiple phrases corresponding to one ID must be separated by ','.
|
||||
|
||||
### Basic Configuration
|
||||
|
||||
Define the following two variables before using the command recognition model:
|
||||
|
||||
1. Model version
|
||||
1. Model version
|
||||
The model version has been configured in `menuconfig` to facilitate your development. Please configure in `menuconfig` and add the following line in your code:
|
||||
|
||||
`static const esp_mn_iface_t *multinet = &MULTINET_MODEL;`
|
||||
|
||||
2. Model parameter
|
||||
The language supported and the effectiveness of the model is determined by model parameters. Now only commands in Chinese are supported. Please configure the `MULTINET_COEFF` option in `menuconfig` and add the following line in your code to generate the model handle.
|
||||
2. Model parameter
|
||||
The supported language and the effectiveness of the model are determined by the model parameters. Now only commands in Chinese are supported. Please configure the `MULTINET_COEFF` option in `menuconfig` and add the following line in your code to generate the model handle. The second argument, 6000, is the audio length for speech recognition, in ms; the range of sample_length is 0~6000.
|
||||
|
||||
`model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);`
|
||||
`model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF, 6000);`
|
||||
|
||||
### API Reference
|
||||
|
||||
@ -68,7 +77,7 @@ Define the following two variables before using the command recognition model:
|
||||
**Parameter**
|
||||
|
||||
* coeff: The coefficient for speech commands recognition.
|
||||
* sample_length Audio length for speech recognition, in ms. The range of sample_length is 0~6000.
|
||||
* sample_length: Audio length for speech recognition, in ms. The range of sample_length is 0~6000.
|
||||
|
||||
**Return**
|
||||
|
||||
@ -86,15 +95,14 @@ Define the following two variables before using the command recognition model:
|
||||
|
||||
**Return**
|
||||
|
||||
The amount of samples to feed the detect function
|
||||
|
||||
The amount of samples to feed the detection function
|
||||
|
||||
|
||||
- `typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);`
|
||||
|
||||
**Definition**
|
||||
|
||||
Callback function type to fetch the number of frames recognized by the command word.
|
||||
Callback function type to fetch the number of frames recognized by the speech command.
|
||||
|
||||
**Parameter**
|
||||
|
||||
@ -102,7 +110,7 @@ Define the following two variables before using the command recognition model:
|
||||
|
||||
**Return**
|
||||
|
||||
The number of the frames recognized by the command word
|
||||
The number of the frames recognized by the speech command
|
||||
|
||||
- `typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);`
|
||||
|
||||
@ -118,37 +126,6 @@ Define the following two variables before using the command recognition model:
|
||||
|
||||
The sample rate, in Hz
|
||||
|
||||
- `typedef int (*esp_mn_iface_op_add_speech_commands_t)(model_iface_data_t *model, int command_id, char *phrase_spelling, char *phrase_str);`
|
||||
|
||||
**Definition**
|
||||
|
||||
Add a command word and set its command ID.
|
||||
|
||||
**Parameters**
|
||||
|
||||
* model: The model object to query
|
||||
|
||||
* command_id: The command ID of this word
|
||||
|
||||
* phrase_spelling: The speech command in Chinese spelled using prescribed rules
|
||||
|
||||
* phrase_str: Auxiliary information of words
|
||||
|
||||
**Return**
|
||||
|
||||
1: Setting success.
|
||||
|
||||
**Note**
|
||||
|
||||
The `phrase_spelling` is the mandarin syllables provided one by one in the form of **one Type A element** and **one Type B element**, which can be seen below:
|
||||
|
||||
* Type A element: `b bi c ch chu cu d di du f g gu h hu j ji ju k ku l li lu m mi n ni nu p pi q qi qu r ru s sh shu su t ti tu w
|
||||
x xi xu y yu z zhu zu`
|
||||
|
||||
* Type B element: `a ai an ang ao e ei en eng er i ie in ing iu o ong ou u ue ui un v ve`
|
||||
|
||||
For example, the Type A and Type B elements for "tiao" are "ti" and "ao", and the syllable "tiao" should provided to the API as "ti ao". Similarly, the command of "dai kai kong tiao", which means turn on the air conditioner, should be provided to the API as "d ai k ai k ong ti ao".
|
||||
|
||||
- `typedef float* (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);`
|
||||
|
||||
**Definition**
|
||||
|
||||
@ -50,7 +50,7 @@ wake word number = 1, word1 name = hilexin
|
||||
|
||||
### 1.3 Waking up the Board
|
||||
|
||||
Find the pre-defined wake-up word of the board in the printed log. In this example, the wake-up word is “Hi Lexin" [Ləsɪ:n].
|
||||
Find the pre-defined wake word of the board in the printed log. In this example, the wake word is “Hi Lexin" [Ləsɪ:n].
|
||||
|
||||
Then, say “Hi Lexin" ([Ləsɪ:n]) to wake up the board, which then wakes up and prints the following log:
|
||||
|
||||
@ -65,7 +65,7 @@ Then, the board enters the Listening status, waiting for new speech commands.
|
||||
|
||||
Currently, the MultiNet model already defined 20 speech commands, which can be seen in [MultiNet](README.md).
|
||||
|
||||
Now, you can give one speech command, for example, "turn on the air conditioner",
|
||||
Now, you can give one speech command, for example, “打开空调 (turn on the air conditioner)”,
|
||||
|
||||
* If this command exists in the supported speech command list, the board prints out the command id of this command in its log:
|
||||
|
||||
@ -76,12 +76,12 @@ Now, you can give one speech command, for example, "turn on the air conditioner"
|
||||
--------------END--------------
|
||||
|
||||
```
|
||||
* If this command does not exist in the supported speech command list, the board prints an error message of "can not recognize any speech commands" in its log:
|
||||
* If this command does not exist in the supported speech command list, the board prints an error message of "cannot recognize any speech commands" in its log:
|
||||
|
||||
|
||||
```
|
||||
-----------LISTENING-----------
|
||||
can not recognize any speech commands
|
||||
cannot recognize any speech commands
|
||||
--------------END--------------
|
||||
|
||||
```
|
||||
@ -118,11 +118,11 @@ For details on the initialization of the ESP32-LyraT-Mini board, please see code
|
||||
|
||||
If you want to choose other development boards other than ESP32-LyraT-Mini, please go to [esp-adf](https://github.com/espressif/esp-adf), which is Espressif's development framework for building audio applications based on ESP32 products, for more detailed information on hardware drivers.
|
||||
|
||||
### 2.2 Wake-up by Keyword
|
||||
### 2.2 Wake-up by Wake Word
|
||||
|
||||
The board enters the Waiting-for-wakeup status after waking up, during which the board will pick up audio data with the on-board microphone, and feed them to the **WakeNet** model frame by frame (30 ms, 16 KHz, 16 bit, mono).
|
||||
|
||||
Currently, you cannot customize wake-up word yourself. Therefore, please contact us for such requests.
|
||||
Currently, you cannot customize the wake word yourself. Therefore, please contact us for such requests.
|
||||
|
||||
### 2.3 Recognizing Speech Commands
|
||||
|
||||
|
||||
@ -10,9 +10,11 @@
|
||||
* @brief Initialize a model instance with specified model coefficient.
|
||||
*
|
||||
* @param coeff The wakenet model coefficient.
|
||||
* @param coeff The wakenet model coefficient.
|
||||
* @param sample_length Audio length for speech recognition, in ms. The range of sample_length is 0~6000.
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff);
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int sample_length);
|
||||
|
||||
|
||||
/**
|
||||
@ -26,6 +28,13 @@ typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the number of frames recognized by the command word
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of the frames recognized by the command word
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
@ -35,20 +44,6 @@ typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Add a command word and set its command ID.
|
||||
*
|
||||
* @param model The model object to query.
|
||||
* @param command_id The command id of this word.
|
||||
* @param phrase_spelling The chinese command word spelled using prescribed rules.
|
||||
* @param phrase_str Auxiliary information of phrase.
|
||||
* @return 1: setting success. 0: setting failure
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_add_speech_commands_t)(model_iface_data_t *model,
|
||||
int command_id,
|
||||
char *phrase_spelling,
|
||||
char *phrase_str);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
||||
*
|
||||
@ -72,7 +67,7 @@ typedef struct {
|
||||
esp_mn_iface_op_create_t create;
|
||||
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_mn_iface_op_add_speech_commands_t add_speech_commands;
|
||||
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
103
speech_commands_recognition/include/mn_process_commands.h
Normal file
103
speech_commands_recognition/include/mn_process_commands.h
Normal file
@ -0,0 +1,103 @@
|
||||
/*
 * Aliases for the speech command settings chosen in menuconfig.
 *
 * Each MN_* name expands to the matching CONFIG_* symbol, so the CONFIG_*
 * definitions (expected to come from sdkconfig.h) must be visible wherever
 * these macros are expanded.
 */
#pragma once

#define SPEECH_COMMANDS_NUM CONFIG_SPEECH_COMMANDS_NUM

#define MN_SPEECH_COMMAND_ID0 CONFIG_SPEECH_COMMAND_ID0
#define MN_SPEECH_COMMAND_ID1 CONFIG_SPEECH_COMMAND_ID1
#define MN_SPEECH_COMMAND_ID2 CONFIG_SPEECH_COMMAND_ID2
#define MN_SPEECH_COMMAND_ID3 CONFIG_SPEECH_COMMAND_ID3
#define MN_SPEECH_COMMAND_ID4 CONFIG_SPEECH_COMMAND_ID4
#define MN_SPEECH_COMMAND_ID5 CONFIG_SPEECH_COMMAND_ID5
#define MN_SPEECH_COMMAND_ID6 CONFIG_SPEECH_COMMAND_ID6
#define MN_SPEECH_COMMAND_ID7 CONFIG_SPEECH_COMMAND_ID7
#define MN_SPEECH_COMMAND_ID8 CONFIG_SPEECH_COMMAND_ID8
#define MN_SPEECH_COMMAND_ID9 CONFIG_SPEECH_COMMAND_ID9
#define MN_SPEECH_COMMAND_ID10 CONFIG_SPEECH_COMMAND_ID10
#define MN_SPEECH_COMMAND_ID11 CONFIG_SPEECH_COMMAND_ID11
#define MN_SPEECH_COMMAND_ID12 CONFIG_SPEECH_COMMAND_ID12
#define MN_SPEECH_COMMAND_ID13 CONFIG_SPEECH_COMMAND_ID13
#define MN_SPEECH_COMMAND_ID14 CONFIG_SPEECH_COMMAND_ID14
#define MN_SPEECH_COMMAND_ID15 CONFIG_SPEECH_COMMAND_ID15
#define MN_SPEECH_COMMAND_ID16 CONFIG_SPEECH_COMMAND_ID16
#define MN_SPEECH_COMMAND_ID17 CONFIG_SPEECH_COMMAND_ID17
#define MN_SPEECH_COMMAND_ID18 CONFIG_SPEECH_COMMAND_ID18
#define MN_SPEECH_COMMAND_ID19 CONFIG_SPEECH_COMMAND_ID19
#define MN_SPEECH_COMMAND_ID20 CONFIG_SPEECH_COMMAND_ID20
#define MN_SPEECH_COMMAND_ID21 CONFIG_SPEECH_COMMAND_ID21
#define MN_SPEECH_COMMAND_ID22 CONFIG_SPEECH_COMMAND_ID22
#define MN_SPEECH_COMMAND_ID23 CONFIG_SPEECH_COMMAND_ID23
#define MN_SPEECH_COMMAND_ID24 CONFIG_SPEECH_COMMAND_ID24
#define MN_SPEECH_COMMAND_ID25 CONFIG_SPEECH_COMMAND_ID25
#define MN_SPEECH_COMMAND_ID26 CONFIG_SPEECH_COMMAND_ID26
#define MN_SPEECH_COMMAND_ID27 CONFIG_SPEECH_COMMAND_ID27
#define MN_SPEECH_COMMAND_ID28 CONFIG_SPEECH_COMMAND_ID28
#define MN_SPEECH_COMMAND_ID29 CONFIG_SPEECH_COMMAND_ID29
#define MN_SPEECH_COMMAND_ID30 CONFIG_SPEECH_COMMAND_ID30
#define MN_SPEECH_COMMAND_ID31 CONFIG_SPEECH_COMMAND_ID31
#define MN_SPEECH_COMMAND_ID32 CONFIG_SPEECH_COMMAND_ID32
#define MN_SPEECH_COMMAND_ID33 CONFIG_SPEECH_COMMAND_ID33
#define MN_SPEECH_COMMAND_ID34 CONFIG_SPEECH_COMMAND_ID34
#define MN_SPEECH_COMMAND_ID35 CONFIG_SPEECH_COMMAND_ID35
#define MN_SPEECH_COMMAND_ID36 CONFIG_SPEECH_COMMAND_ID36
#define MN_SPEECH_COMMAND_ID37 CONFIG_SPEECH_COMMAND_ID37
#define MN_SPEECH_COMMAND_ID38 CONFIG_SPEECH_COMMAND_ID38
#define MN_SPEECH_COMMAND_ID39 CONFIG_SPEECH_COMMAND_ID39
#define MN_SPEECH_COMMAND_ID40 CONFIG_SPEECH_COMMAND_ID40
#define MN_SPEECH_COMMAND_ID41 CONFIG_SPEECH_COMMAND_ID41
#define MN_SPEECH_COMMAND_ID42 CONFIG_SPEECH_COMMAND_ID42
#define MN_SPEECH_COMMAND_ID43 CONFIG_SPEECH_COMMAND_ID43
#define MN_SPEECH_COMMAND_ID44 CONFIG_SPEECH_COMMAND_ID44
#define MN_SPEECH_COMMAND_ID45 CONFIG_SPEECH_COMMAND_ID45
#define MN_SPEECH_COMMAND_ID46 CONFIG_SPEECH_COMMAND_ID46
#define MN_SPEECH_COMMAND_ID47 CONFIG_SPEECH_COMMAND_ID47
#define MN_SPEECH_COMMAND_ID48 CONFIG_SPEECH_COMMAND_ID48
#define MN_SPEECH_COMMAND_ID49 CONFIG_SPEECH_COMMAND_ID49
#define MN_SPEECH_COMMAND_ID50 CONFIG_SPEECH_COMMAND_ID50
#define MN_SPEECH_COMMAND_ID51 CONFIG_SPEECH_COMMAND_ID51
#define MN_SPEECH_COMMAND_ID52 CONFIG_SPEECH_COMMAND_ID52
#define MN_SPEECH_COMMAND_ID53 CONFIG_SPEECH_COMMAND_ID53
#define MN_SPEECH_COMMAND_ID54 CONFIG_SPEECH_COMMAND_ID54
#define MN_SPEECH_COMMAND_ID55 CONFIG_SPEECH_COMMAND_ID55
#define MN_SPEECH_COMMAND_ID56 CONFIG_SPEECH_COMMAND_ID56
#define MN_SPEECH_COMMAND_ID57 CONFIG_SPEECH_COMMAND_ID57
#define MN_SPEECH_COMMAND_ID58 CONFIG_SPEECH_COMMAND_ID58
#define MN_SPEECH_COMMAND_ID59 CONFIG_SPEECH_COMMAND_ID59
#define MN_SPEECH_COMMAND_ID60 CONFIG_SPEECH_COMMAND_ID60
#define MN_SPEECH_COMMAND_ID61 CONFIG_SPEECH_COMMAND_ID61
#define MN_SPEECH_COMMAND_ID62 CONFIG_SPEECH_COMMAND_ID62
#define MN_SPEECH_COMMAND_ID63 CONFIG_SPEECH_COMMAND_ID63
#define MN_SPEECH_COMMAND_ID64 CONFIG_SPEECH_COMMAND_ID64
#define MN_SPEECH_COMMAND_ID65 CONFIG_SPEECH_COMMAND_ID65
#define MN_SPEECH_COMMAND_ID66 CONFIG_SPEECH_COMMAND_ID66
#define MN_SPEECH_COMMAND_ID67 CONFIG_SPEECH_COMMAND_ID67
#define MN_SPEECH_COMMAND_ID68 CONFIG_SPEECH_COMMAND_ID68
#define MN_SPEECH_COMMAND_ID69 CONFIG_SPEECH_COMMAND_ID69
#define MN_SPEECH_COMMAND_ID70 CONFIG_SPEECH_COMMAND_ID70
#define MN_SPEECH_COMMAND_ID71 CONFIG_SPEECH_COMMAND_ID71
#define MN_SPEECH_COMMAND_ID72 CONFIG_SPEECH_COMMAND_ID72
#define MN_SPEECH_COMMAND_ID73 CONFIG_SPEECH_COMMAND_ID73
#define MN_SPEECH_COMMAND_ID74 CONFIG_SPEECH_COMMAND_ID74
#define MN_SPEECH_COMMAND_ID75 CONFIG_SPEECH_COMMAND_ID75
#define MN_SPEECH_COMMAND_ID76 CONFIG_SPEECH_COMMAND_ID76
#define MN_SPEECH_COMMAND_ID77 CONFIG_SPEECH_COMMAND_ID77
#define MN_SPEECH_COMMAND_ID78 CONFIG_SPEECH_COMMAND_ID78
#define MN_SPEECH_COMMAND_ID79 CONFIG_SPEECH_COMMAND_ID79
#define MN_SPEECH_COMMAND_ID80 CONFIG_SPEECH_COMMAND_ID80
#define MN_SPEECH_COMMAND_ID81 CONFIG_SPEECH_COMMAND_ID81
#define MN_SPEECH_COMMAND_ID82 CONFIG_SPEECH_COMMAND_ID82
#define MN_SPEECH_COMMAND_ID83 CONFIG_SPEECH_COMMAND_ID83
#define MN_SPEECH_COMMAND_ID84 CONFIG_SPEECH_COMMAND_ID84
#define MN_SPEECH_COMMAND_ID85 CONFIG_SPEECH_COMMAND_ID85
#define MN_SPEECH_COMMAND_ID86 CONFIG_SPEECH_COMMAND_ID86
#define MN_SPEECH_COMMAND_ID87 CONFIG_SPEECH_COMMAND_ID87
#define MN_SPEECH_COMMAND_ID88 CONFIG_SPEECH_COMMAND_ID88
#define MN_SPEECH_COMMAND_ID89 CONFIG_SPEECH_COMMAND_ID89
#define MN_SPEECH_COMMAND_ID90 CONFIG_SPEECH_COMMAND_ID90
#define MN_SPEECH_COMMAND_ID91 CONFIG_SPEECH_COMMAND_ID91
#define MN_SPEECH_COMMAND_ID92 CONFIG_SPEECH_COMMAND_ID92
#define MN_SPEECH_COMMAND_ID93 CONFIG_SPEECH_COMMAND_ID93
#define MN_SPEECH_COMMAND_ID94 CONFIG_SPEECH_COMMAND_ID94
#define MN_SPEECH_COMMAND_ID95 CONFIG_SPEECH_COMMAND_ID95
#define MN_SPEECH_COMMAND_ID96 CONFIG_SPEECH_COMMAND_ID96
#define MN_SPEECH_COMMAND_ID97 CONFIG_SPEECH_COMMAND_ID97
#define MN_SPEECH_COMMAND_ID98 CONFIG_SPEECH_COMMAND_ID98
#define MN_SPEECH_COMMAND_ID99 CONFIG_SPEECH_COMMAND_ID99

/**
 * @brief Get the phrase string configured for a speech command ID.
 *
 * @param i Speech command ID, 0..99.
 * @return The value of the matching MN_SPEECH_COMMAND_IDn macro, or NULL if
 *         i is outside 0..99.
 */
char *get_id_name(int i);
|
||||
Binary file not shown.
213
speech_commands_recognition/mn_process_commands.c
Normal file
213
speech_commands_recognition/mn_process_commands.c
Normal file
@ -0,0 +1,213 @@
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include "sdkconfig.h"
|
||||
#include "mn_process_commands.h"
|
||||
|
||||
char *get_id_name(int i)
|
||||
{
|
||||
// char command_phrase[128];
|
||||
if (i == 0)
|
||||
return MN_SPEECH_COMMAND_ID0;
|
||||
else if (i == 1)
|
||||
return MN_SPEECH_COMMAND_ID1;
|
||||
else if (i == 2)
|
||||
return MN_SPEECH_COMMAND_ID2;
|
||||
else if (i == 3)
|
||||
return MN_SPEECH_COMMAND_ID3;
|
||||
else if (i == 4)
|
||||
return MN_SPEECH_COMMAND_ID4;
|
||||
else if (i == 5)
|
||||
return MN_SPEECH_COMMAND_ID5;
|
||||
else if (i == 6)
|
||||
return MN_SPEECH_COMMAND_ID6;
|
||||
else if (i == 7)
|
||||
return MN_SPEECH_COMMAND_ID7;
|
||||
else if (i == 8)
|
||||
return MN_SPEECH_COMMAND_ID8;
|
||||
else if (i == 9)
|
||||
return MN_SPEECH_COMMAND_ID9;
|
||||
else if (i == 10)
|
||||
return MN_SPEECH_COMMAND_ID10;
|
||||
else if (i == 11)
|
||||
return MN_SPEECH_COMMAND_ID11;
|
||||
else if (i == 12)
|
||||
return MN_SPEECH_COMMAND_ID12;
|
||||
else if (i == 13)
|
||||
return MN_SPEECH_COMMAND_ID13;
|
||||
else if (i == 14)
|
||||
return MN_SPEECH_COMMAND_ID14;
|
||||
else if (i == 15)
|
||||
return MN_SPEECH_COMMAND_ID15;
|
||||
else if (i == 16)
|
||||
return MN_SPEECH_COMMAND_ID16;
|
||||
else if (i == 17)
|
||||
return MN_SPEECH_COMMAND_ID17;
|
||||
else if (i == 18)
|
||||
return MN_SPEECH_COMMAND_ID18;
|
||||
else if (i == 19)
|
||||
return MN_SPEECH_COMMAND_ID19;
|
||||
else if (i == 20)
|
||||
return MN_SPEECH_COMMAND_ID20;
|
||||
else if (i == 21)
|
||||
return MN_SPEECH_COMMAND_ID21;
|
||||
else if (i == 22)
|
||||
return MN_SPEECH_COMMAND_ID22;
|
||||
else if (i == 23)
|
||||
return MN_SPEECH_COMMAND_ID23;
|
||||
else if (i == 24)
|
||||
return MN_SPEECH_COMMAND_ID24;
|
||||
else if (i == 25)
|
||||
return MN_SPEECH_COMMAND_ID25;
|
||||
else if (i == 26)
|
||||
return MN_SPEECH_COMMAND_ID26;
|
||||
else if (i == 27)
|
||||
return MN_SPEECH_COMMAND_ID27;
|
||||
else if (i == 28)
|
||||
return MN_SPEECH_COMMAND_ID28;
|
||||
else if (i == 29)
|
||||
return MN_SPEECH_COMMAND_ID29;
|
||||
else if (i == 30)
|
||||
return MN_SPEECH_COMMAND_ID30;
|
||||
else if (i == 31)
|
||||
return MN_SPEECH_COMMAND_ID31;
|
||||
else if (i == 32)
|
||||
return MN_SPEECH_COMMAND_ID32;
|
||||
else if (i == 33)
|
||||
return MN_SPEECH_COMMAND_ID33;
|
||||
else if (i == 34)
|
||||
return MN_SPEECH_COMMAND_ID34;
|
||||
else if (i == 35)
|
||||
return MN_SPEECH_COMMAND_ID35;
|
||||
else if (i == 36)
|
||||
return MN_SPEECH_COMMAND_ID36;
|
||||
else if (i == 37)
|
||||
return MN_SPEECH_COMMAND_ID37;
|
||||
else if (i == 38)
|
||||
return MN_SPEECH_COMMAND_ID38;
|
||||
else if (i == 39)
|
||||
return MN_SPEECH_COMMAND_ID39;
|
||||
else if (i == 40)
|
||||
return MN_SPEECH_COMMAND_ID40;
|
||||
else if (i == 41)
|
||||
return MN_SPEECH_COMMAND_ID41;
|
||||
else if (i == 42)
|
||||
return MN_SPEECH_COMMAND_ID42;
|
||||
else if (i == 43)
|
||||
return MN_SPEECH_COMMAND_ID43;
|
||||
else if (i == 44)
|
||||
return MN_SPEECH_COMMAND_ID44;
|
||||
else if (i == 45)
|
||||
return MN_SPEECH_COMMAND_ID45;
|
||||
else if (i == 46)
|
||||
return MN_SPEECH_COMMAND_ID46;
|
||||
else if (i == 47)
|
||||
return MN_SPEECH_COMMAND_ID47;
|
||||
else if (i == 48)
|
||||
return MN_SPEECH_COMMAND_ID48;
|
||||
else if (i == 49)
|
||||
return MN_SPEECH_COMMAND_ID49;
|
||||
else if (i == 50)
|
||||
return MN_SPEECH_COMMAND_ID50;
|
||||
else if (i == 51)
|
||||
return MN_SPEECH_COMMAND_ID51;
|
||||
else if (i == 52)
|
||||
return MN_SPEECH_COMMAND_ID52;
|
||||
else if (i == 53)
|
||||
return MN_SPEECH_COMMAND_ID53;
|
||||
else if (i == 54)
|
||||
return MN_SPEECH_COMMAND_ID54;
|
||||
else if (i == 55)
|
||||
return MN_SPEECH_COMMAND_ID55;
|
||||
else if (i == 56)
|
||||
return MN_SPEECH_COMMAND_ID56;
|
||||
else if (i == 57)
|
||||
return MN_SPEECH_COMMAND_ID57;
|
||||
else if (i == 58)
|
||||
return MN_SPEECH_COMMAND_ID58;
|
||||
else if (i == 59)
|
||||
return MN_SPEECH_COMMAND_ID59;
|
||||
else if (i == 60)
|
||||
return MN_SPEECH_COMMAND_ID60;
|
||||
else if (i == 61)
|
||||
return MN_SPEECH_COMMAND_ID61;
|
||||
else if (i == 62)
|
||||
return MN_SPEECH_COMMAND_ID62;
|
||||
else if (i == 63)
|
||||
return MN_SPEECH_COMMAND_ID63;
|
||||
else if (i == 64)
|
||||
return MN_SPEECH_COMMAND_ID64;
|
||||
else if (i == 65)
|
||||
return MN_SPEECH_COMMAND_ID65;
|
||||
else if (i == 66)
|
||||
return MN_SPEECH_COMMAND_ID66;
|
||||
else if (i == 67)
|
||||
return MN_SPEECH_COMMAND_ID67;
|
||||
else if (i == 68)
|
||||
return MN_SPEECH_COMMAND_ID68;
|
||||
else if (i == 69)
|
||||
return MN_SPEECH_COMMAND_ID69;
|
||||
else if (i == 70)
|
||||
return MN_SPEECH_COMMAND_ID70;
|
||||
else if (i == 71)
|
||||
return MN_SPEECH_COMMAND_ID71;
|
||||
else if (i == 72)
|
||||
return MN_SPEECH_COMMAND_ID72;
|
||||
else if (i == 73)
|
||||
return MN_SPEECH_COMMAND_ID73;
|
||||
else if (i == 74)
|
||||
return MN_SPEECH_COMMAND_ID74;
|
||||
else if (i == 75)
|
||||
return MN_SPEECH_COMMAND_ID75;
|
||||
else if (i == 76)
|
||||
return MN_SPEECH_COMMAND_ID76;
|
||||
else if (i == 77)
|
||||
return MN_SPEECH_COMMAND_ID77;
|
||||
else if (i == 78)
|
||||
return MN_SPEECH_COMMAND_ID78;
|
||||
else if (i == 79)
|
||||
return MN_SPEECH_COMMAND_ID79;
|
||||
else if (i == 80)
|
||||
return MN_SPEECH_COMMAND_ID80;
|
||||
else if (i == 81)
|
||||
return MN_SPEECH_COMMAND_ID81;
|
||||
else if (i == 82)
|
||||
return MN_SPEECH_COMMAND_ID82;
|
||||
else if (i == 83)
|
||||
return MN_SPEECH_COMMAND_ID83;
|
||||
else if (i == 84)
|
||||
return MN_SPEECH_COMMAND_ID84;
|
||||
else if (i == 85)
|
||||
return MN_SPEECH_COMMAND_ID85;
|
||||
else if (i == 86)
|
||||
return MN_SPEECH_COMMAND_ID86;
|
||||
else if (i == 87)
|
||||
return MN_SPEECH_COMMAND_ID87;
|
||||
else if (i == 88)
|
||||
return MN_SPEECH_COMMAND_ID88;
|
||||
else if (i == 89)
|
||||
return MN_SPEECH_COMMAND_ID89;
|
||||
else if (i == 90)
|
||||
return MN_SPEECH_COMMAND_ID90;
|
||||
else if (i == 91)
|
||||
return MN_SPEECH_COMMAND_ID91;
|
||||
else if (i == 92)
|
||||
return MN_SPEECH_COMMAND_ID92;
|
||||
else if (i == 93)
|
||||
return MN_SPEECH_COMMAND_ID93;
|
||||
else if (i == 94)
|
||||
return MN_SPEECH_COMMAND_ID94;
|
||||
else if (i == 95)
|
||||
return MN_SPEECH_COMMAND_ID95;
|
||||
else if (i == 96)
|
||||
return MN_SPEECH_COMMAND_ID96;
|
||||
else if (i == 97)
|
||||
return MN_SPEECH_COMMAND_ID97;
|
||||
else if (i == 98)
|
||||
return MN_SPEECH_COMMAND_ID98;
|
||||
else if (i == 99)
|
||||
return MN_SPEECH_COMMAND_ID99;
|
||||
else
|
||||
return NULL;
|
||||
}
|
||||
|
||||
@ -1,14 +1,14 @@
|
||||
# Espressif Speech Wake Words Customization Process
|
||||
# Espressif Speech Wake Word Customization Process
|
||||
|
||||
#### Offline Wake Words Customization
|
||||
#### Offline Wake Word Customization
|
||||
|
||||
Espressif provides users with the **Off-line Wake Words Customization** service, which allows users to use both publicly available Wake Wordss (such as "Hi Lexin", "Alexa", and "Espressif") and customized Wake Wordss.
|
||||
Espressif provides users with the **Off-line Wake Word Customization** service, which allows users to use both publicly available Wake Words (such as "Hi Lexin", "Alexa", and "Espressif") and customized Wake Words.
|
||||
|
||||
1. If you want to use publicly available Wake Wordss for commercial use,
|
||||
1. If you want to use publicly available Wake Words for commercial use,
|
||||
- please check the Wake Words provided in ADF/ASR Demos;
|
||||
- We will continue to provide more and more Wake Wordss that are free for commercial use.
|
||||
- We will continue to provide more and more Wake Words that are free for commercial use.
|
||||
|
||||
2. If you want to use your own wake words, we can also provide the **Off-line Wake Words Customization** service.
|
||||
2. If you want to use your own wake words, we can also provide the **Off-line Wake Word Customization** service.
|
||||
- If you are able to provide a training corpus meeting the requirements described in the following **Requirements on Corpus**.
|
||||
- We need two to three weeks for training and optimization.
|
||||
- Service fee will be charged by Espressif in this case.
|
||||
@ -21,9 +21,9 @@ Espressif provides users with the **Off-line Wake Words Customization** service,
|
||||
- For details on the fee and time required for customization, please email us at [sales@espressif.com](sales@espressif.com).
|
||||
- We will agree on a reasonable plan based on the number of wake words to be customized and the scale of your production.
|
||||
|
||||
3. About Espressif Wake Words Model
|
||||
- Now, a single wake words model can recognize up to five Wake Wordss
|
||||
- Normally, each Wake Words contains three to six syllables, such as "Hi Le xin" (3 syllables), “Alexa” (3 syllables), "小爱同学" (4 syllables).
|
||||
3. About Espressif Wake Word Model
|
||||
- Now, a single wake word model can recognize up to five Wake Words
|
||||
- Normally, each Wake Word contains three to six syllables, such as "Hi Le xin" (3 syllables), “Alexa” (3 syllables), "小爱同学" (4 syllables).
|
||||
- Several wake words can be used in combination based on your actual requirement.
|
||||
|
||||
#### Requirements on Corpus Texts
|
||||
@ -40,12 +40,12 @@ You can provide us your training corpus by preparing it yourself or purchasing o
|
||||
- Sample size: no less than 500 people, among which,
|
||||
- The number of males and females should be similar;
|
||||
- The number of people in different age-group should be similar;
|
||||
- The number of Children should be larger than 100 (If the child is one of your target users).
|
||||
- The number of children should be larger than 100 (If children are one of your target users).
|
||||
- Environment:
|
||||
- It's advised to collect your samples with a Hi-Fi microphone in a professional audio room, with an ambient noise lower than 40 dB.
|
||||
- Each participant should
|
||||
- Position himself/herself at a distance of one meter from the microphone, and repeat the Wake Wordss for 15 times (5 times fast, 10 times normal);
|
||||
- Position himself/herself at a distance of three meters from the microphone, and repeat the Wake Wordss for 15 times (5 times fast, 10 times normal);
|
||||
- Position himself/herself at a distance of one meter from the microphone, and repeat the Wake Word for 15 times (5 times fast, 10 times normal);
|
||||
- Position himself/herself at a distance of three meters from the microphone, and repeat the Wake Word for 15 times (5 times fast, 10 times normal);
|
||||
- The name of each sample file should reflect the sex, age, and speech speed of the speaker. An example for naming your sample file is `female_age_fast_id.wav`. Alternatively, you can provide a separate form to record this information.
|
||||
|
||||
#### Hardware Design and Test
|
||||
@ -71,7 +71,7 @@ You can provide us your training corpus by preparing it yourself or purchasing o
|
||||
- Test the performance of the **Acoustic Echo Cancellation** algorithm
|
||||
- Test the performance of the **Speech Enhancement** algorithm
|
||||
|
||||
3. After you hardware design, it's advised to **send** 1 or 2 pieces of your hardware, so we can optimize its performance for wake words detection on a whole product level.
|
||||
3. After your hardware design, it's advised to **send** 1 or 2 pieces of your hardware, so we can optimize its performance for wake word detection on a whole product level.
|
||||
|
||||
|
||||
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
# WakeNet
|
||||
|
||||
WakeNet, which is a wake words engine built upon neural network, is specially designed for low-power embedded MCUs. Now, the WakeNet model supports up to 5 wake wordss.
|
||||
WakeNet, which is a wake word engine built upon neural network, is specially designed for low-power embedded MCUs. Now, the WakeNet model supports up to 5 wake words.
|
||||
|
||||
## Overview
|
||||
|
||||
@ -14,7 +14,7 @@ Please see the flow diagram of WakeNet below:
|
||||
The WakeNet uses [MFCC](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) to obtain the features of the input audio clip (16 KHz, 16 bit, single track). The window width and step width of each frame of the audio clip are both 30 ms.
|
||||
|
||||
- Neural Network:
|
||||
Now, the natural network structure has been updated to the sixth edition, among which,
|
||||
Now, the neural network structure has been updated to the sixth edition, among which,
|
||||
- WakeNet1 and WakeNet2 are no longer in use.
|
||||
- WakeNet3 and WakeNet4 are built upon the [CRNN](https://arxiv.org/abs/1703.05390) structure.
|
||||
- WakeNet5 and WakeNet6 are built upon the [Dilated Convolution](https://arxiv.org/pdf/1609.03499.pdf) structure.
|
||||
@ -71,18 +71,17 @@ Please see the flow diagram of WakeNet below:
|
||||
|Quantized WakeNet3|26 K|20 KB|29 ms|90 ms|
|
||||
|Quantised WakeNet4|53 K|22 KB|48 ms|90 ms|
|
||||
|Quantised WakeNet5|41 K|15 KB|7 ms|30 ms|
|
||||
|Quantised WakeNet6|41 K|20 KB|9 ms|30 ms|
|
||||
|
||||
### 2. Performance
|
||||
|
||||
|Distance| Quiet | Stationary Noise (SNR = 0 ~ 10 dB)| Speech Noise (SNR = 0 ~ 10 dB)| AEC Interruption (-5 ~ -15 dB)|
|
||||
|Distance| Quiet | Stationary Noise (SNR = 5 ~ 10 dB)| Speech Noise (SNR = 5 ~ 10 dB)| AEC Interruption (-5 ~ -10 dB)|
|
||||
|:---:|:---:|:---:|:---:|:---:|
|
||||
|1 m|97%|90%|88%|89%|
|
||||
|3 m|95%|85%|75%|73%|
|
||||
|
||||
False triggering rate: 1 time in 20 hours
|
||||
|
||||
**Note**: We use the ESP32-LyraT-Mini development board and the WakeNet6 model in our test. The performance is limited because ESP32-LyraT-Mini only has one microphone. We expect a better recognition performance when more microphones are involved in the test.
|
||||
**Note**: We use the ESP32-LyraT-Mini development board and the WakeNet5 model in our test. The performance is limited because ESP32-LyraT-Mini only has one microphone. We expect a better recognition performance when more microphones are involved in the test.
|
||||
|
||||
## Wake Word Customization
|
||||
|
||||
|
||||
@ -7,6 +7,7 @@
|
||||
extern const esp_wn_iface_t esp_sr_wakenet3_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet4_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet5_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet5_float;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
|
||||
/*
|
||||
@ -16,6 +17,8 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
#define WAKENET_MODEL esp_sr_wakenet3_quantized
|
||||
#elif CONFIG_SR_MODEL_WN4_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet4_quantized
|
||||
#elif CONFIG_SR_MODEL_WN5_FLOAT
|
||||
#define WAKENET_MODEL esp_sr_wakenet5_float
|
||||
#elif CONFIG_SR_MODEL_WN5_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet5_quantized
|
||||
#elif CONFIG_SR_MODEL_WN6_QUANT
|
||||
@ -35,13 +38,17 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
#include "hilexin_wn4.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn4
|
||||
|
||||
#elif CONFIG_SR_WN5_HILEXIN
|
||||
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_FLOAT
|
||||
#include "hilexin_wn5_float.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn5_float
|
||||
|
||||
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
|
||||
#include "hilexin_wn5.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn5
|
||||
|
||||
#elif CONFIG_SR_WN6_HILEXIN
|
||||
#include "hilexin_wn6.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn6
|
||||
#include "nihaoxiaoxin_wn6.h"
|
||||
#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6
|
||||
|
||||
#elif CONFIG_SR_WN5_CUSTOMIZED_WORD
|
||||
#include "customized_word_wn5.h"
|
||||
|
||||
8
wake_words_engine/include/nihaoxiaoxin_wn6.h
Normal file
8
wake_words_engine/include/nihaoxiaoxin_wn6.h
Normal file
@ -0,0 +1,8 @@
|
||||
//Generated by mkmodel
|
||||
#pragma once
|
||||
#include <string.h>
|
||||
#include "dl_lib_coefgetter_if.h"
|
||||
#include "dl_lib_matrix.h"
|
||||
#include "dl_lib_matrixq.h"
|
||||
|
||||
extern const model_coeff_getter_t get_coeff_nihaoxiaoxin_wn6;
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
wake_words_engine/libnihaoxiaoxin_wn6.a
Normal file
BIN
wake_words_engine/libnihaoxiaoxin_wn6.a
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user