feat(MN): Support continuous recognition

This commit is contained in:
Wang Wang Wang 2022-01-19 16:13:04 +08:00
parent d05cf97972
commit 4d3e550e72
19 changed files with 97 additions and 36 deletions

View File

@ -108,6 +108,10 @@ choice SR_WN_WAKE_WORD_SEL
config SR_WN_WN7_HILEXIN
bool "Hi,Lexin (WakeNet7)"
depends on SR_WN_MODEL_WN7_QUANT || SR_WN_MODEL_WN7_QUANT8
config SR_WN_WN8_HILEXIN
bool "Hi,Lexin (WakeNet8)"
depends on SR_WN_MODEL_WN8_QUANT
config SR_WN_WN8_ALEXA
bool "Alexa (WakeNet8)"
@ -177,7 +181,6 @@ choice SR_MN_MODE_SEL
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
bool "chinese single recognition (MultiNet4.5)"
depends on SR_MN_CHINESE && IDF_TARGET_ESP32S3
endchoice
menu "Add speech commands"

View File

@ -2,6 +2,7 @@
#include "stdint.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
//AFE: Audio Front-End
//SR: Speech Recognition
@ -18,33 +19,48 @@ typedef enum {
// the output state of fetch function
typedef enum {
AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified
AFE_FETCH_NOISE = -1, // vad state: noise or silence
AFE_FETCH_SPEECH = 0, // vad state: speech
AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected
AFE_FETCH_ERROR = -3, // fetch empty data, retry it
AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified
AFE_FETCH_NOISE = -1, // vad state: noise or silence
AFE_FETCH_SPEECH = 0, // vad state: speech
AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected
} afe_fetch_mode_t;
typedef enum {
AFE_PSRAM_LOW_COST = 1,
AFE_PSRAM_MEDIA_COST = 2,
AFE_PSRAM_HIGH_COST = 3
AFE_PSRAM_LOW_COST = 0,
AFE_PSRAM_MIDDLE_COST = 1,
AFE_PSRAM_HIGH_COST = 2
} afe_use_psram_mode_t;
// Peak-amplitude (AGC) setting for the audio fed to MultiNet (MN).
// For the AGC modes, the enumerator's numeric value equals the peak level in dB.
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -5, // Peak amplitude of the audio fed to MultiNet is -5 dB
AFE_MN_PEAK_AGC_MODE_2 = -4, // Peak amplitude of the audio fed to MultiNet is -4 dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // Peak amplitude of the audio fed to MultiNet is -3 dB
AFE_MN_PEAK_NO_AGC = 0, // No AGC gain is applied
} afe_mn_peak_agc_mode_t;
// Channel layout of the original PCM data handed to the AFE feed function.
typedef struct {
int total_ch_num; // Total number of channels; must satisfy total_ch_num == mic_num + ref_num
int mic_num; // Number of microphone channels
int ref_num; // Number of reference channels
} afe_pcm_config_t;
typedef struct {
bool aec_init;
bool se_init;
bool vad_init;
bool wakenet_init;
int vad_mode;
const esp_wn_iface_t *wakenet_model;
const model_coeff_getter_t *wakenet_coeff;
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
esp_wn_iface_t *wakenet_model;
model_coeff_getter_t *wakenet_coeff;
det_mode_t wakenet_mode;
afe_sr_mode_t afe_mode;
int afe_perferred_core;
int afe_perferred_priority;
int afe_ringbuf_size;
int alloc_from_psram;
int agc_mode;
afe_use_psram_mode_t alloc_from_psram;
afe_mn_peak_agc_mode_t agc_mode;
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
} afe_config_t;
@ -54,7 +70,7 @@ typedef struct {
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.vad_mode = 3, \
.vad_mode = VAD_MODE_3, \
.wakenet_model = &WAKENET_MODEL, \
.wakenet_coeff = &WAKENET_COEFF, \
.wakenet_mode = DET_MODE_90, \
@ -62,8 +78,11 @@ typedef struct {
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.alloc_from_psram = 1, \
.agc_mode = 2, \
.alloc_from_psram = AFE_PSRAM_MIDDLE_COST, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 2, \
.pcm_config.mic_num = 1, \
.pcm_config.ref_num = 1, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
@ -71,7 +90,7 @@ typedef struct {
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.vad_mode = 3, \
.vad_mode = VAD_MODE_3, \
.wakenet_model = &WAKENET_MODEL, \
.wakenet_coeff = &WAKENET_COEFF, \
.wakenet_mode = DET_MODE_2CH_90, \
@ -79,8 +98,11 @@ typedef struct {
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.alloc_from_psram = AFE_PSRAM_MEDIA_COST, \
.agc_mode = 2, \
.alloc_from_psram = AFE_PSRAM_HIGH_COST, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \
.pcm_config.ref_num = 1, \
}
#endif
/**
@ -113,10 +135,18 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number of samples that need to be passed to the fetch function
* @brief Get the total number of channels that has been configured
*
* @param afe The AFE_SR object to query
* @return The amount of samples to feed the fetch function
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the number of mic channels that has been configured
*
* @param afe The AFE_SR object to query
* @return The amount of mic channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
@ -232,6 +262,7 @@ typedef struct {
esp_afe_sr_iface_op_fetch_t fetch;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
@ -243,3 +274,5 @@ typedef struct {
esp_afe_sr_iface_op_enable_se_t enable_se;
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
extern esp_afe_sr_iface_t esp_afe_sr;

View File

@ -1,20 +1,33 @@
#pragma once
#include "stdint.h"
// #include "esp_err.h"
#include "dl_lib_coefgetter_if.h"
#include "esp_wn_iface.h"
// //Opaque model data container
// typedef struct model_iface_data_t model_iface_data_t;
// Return all possible recognition results
#define ESP_MN_RESULT_MAX_NUM 5
// Recognition state reported in esp_mn_results_t.state.
typedef enum {
ESP_MN_STATE_DETECTING = 0, // Detection is still in progress
ESP_MN_STATE_DETECTED = 1, // A result has been detected
ESP_MN_STATE_TIMEOUT = 2, // Detection timed out
} esp_mn_state_t;
// Recognition results: up to ESP_MN_RESULT_MAX_NUM candidate phrases.
// NOTE(review): presumably only the first `num` entries of each array are valid; confirm against the implementation.
typedef struct{
esp_mn_state_t state; // Current recognition state (see esp_mn_state_t)
int num; // Number of phrases in the lists, num <= 5 (ESP_MN_RESULT_MAX_NUM). num == 0 means no phrase was recognized.
int command_id[ESP_MN_RESULT_MAX_NUM]; // List of command IDs.
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // List of phrase IDs.
float prob[ESP_MN_RESULT_MAX_NUM]; // List of probabilities.
} esp_mn_results_t;
/**
* @brief Initialize a model instance with specified model coefficient.
*
* @param coeff The wakenet model coefficient.
* @param coeff The wakenet model coefficient.
* @param duration The duration (ms) to trigger the timeout
* @parm sample_length Audio length for speech recognition, in ms.
* @returns Handle to the model data.
*/
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int sample_length);
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int duration);
/**
@ -96,16 +109,22 @@ typedef int (*esp_mn_iface_op_get_det_phrase_id_t)(model_iface_data_t *model);
*/
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* @brief Reset the speech commands
*
* @param model_data The model object to query.
* @param command_str The string of new commands.
* @param err_phrase_id Wrong phrase ID string.
* @brief Reset the speech commands recognition model
*
*/
typedef void (*esp_mn_iface_op_reset_t)(model_iface_data_t *model_data, char *command_str, char *err_phrase_id);
/**
* @brief Get recognition results
*
* @param model The model object to query
*
* @return The current results.
*/
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
/**
* @brief Reset the speech commands recognition model
*
@ -134,6 +153,7 @@ typedef struct {
esp_mn_iface_op_detect_t detect;
esp_mn_iface_op_destroy_t destroy;
esp_mn_iface_op_reset_t reset;
esp_mn_iface_op_get_results_t get_results;
esp_mn_iface_op_wakenet_reset_t wakenet_reset;
esp_mn_iface_op_close_log_t close_log;
} esp_mn_iface_t;

View File

@ -96,6 +96,9 @@ extern const esp_wn_iface_t esp_sr_wakenet8_quantized8;
#elif CONFIG_SR_WN_WN8_HIESP & CONFIG_SR_WN_MODEL_WN8_QUANT8
#define WAKENET_COEFF "hiesp8q8"
#elif CONFIG_SR_WN_WN8_HILEXIN & CONFIG_SR_WN_MODEL_WN8_QUANT
#define WAKENET_COEFF "hilexin8"
#else
#error No valid wake word selected.
#endif

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1 +1 @@
4e7ee1f3d6dbe62bf556d209d69fb331dbacc72a
c008766c5e30abbd2e418086a688de59359ff1df

View File

@ -48,6 +48,8 @@ elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT8" i
wakenet_model = 'alexa7q8'
elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT" in WN_STRING:
wakenet_model = 'alexa7'
elif "CONFIG_SR_WN_WN8_HILEXIN" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
wakenet_model = 'hilexin8'
elif "CONFIG_SR_WN_WN8_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
wakenet_model = 'alexa8'
elif "CONFIG_SR_WN_WN8_HIESP" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT8" in WN_STRING:
@ -75,7 +77,6 @@ print(wakenet_model)
print(multinet_model)
target_model = model_path + '/target'
if os.path.exists(target_model):
shutil.rmtree(target_model)
os.makedirs(target_model)

View File

@ -1 +1 @@
MN5Q8_v1_english_8_0.9_0.90
MN5Q8_v2_english_8_0.9_0.90

View File

@ -1 +1 @@
wakeNet8_v5h8_alexa_5_0.57_0.59
wakeNet8_v5_alexa_5_0.55_0.54

View File

@ -1 +1 @@
WakeNet8_v3h8_hiesp_5_0.59_0.60
WakeNet8_v3h8_hiesp_5_0.60_0.616

Binary file not shown.

View File

@ -0,0 +1 @@
WakeNet8_v3h8_hilexin_5_0.625_0.635

Binary file not shown.

Binary file not shown.