mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
feat(MN): Support continuous recognition
This commit is contained in:
parent
d05cf97972
commit
4d3e550e72
@ -108,6 +108,10 @@ choice SR_WN_WAKE_WORD_SEL
|
||||
config SR_WN_WN7_HILEXIN
|
||||
bool "Hi,Lexin (WakeNet7)"
|
||||
depends on SR_WN_MODEL_WN7_QUANT || SR_WN_MODEL_WN7_QUANT8
|
||||
|
||||
config SR_WN_WN8_HILEXIN
|
||||
bool "Hi,Lexin (WakeNet8)"
|
||||
depends on SR_WN_MODEL_WN8_QUANT
|
||||
|
||||
config SR_WN_WN8_ALEXA
|
||||
bool "Alexa (WakeNet8)"
|
||||
@ -177,7 +181,6 @@ choice SR_MN_MODE_SEL
|
||||
config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
|
||||
bool "chinese single recognition (MultiNet4.5)"
|
||||
depends on SR_MN_CHINESE && IDF_TARGET_ESP32S3
|
||||
|
||||
endchoice
|
||||
|
||||
menu "Add speech commands"
|
||||
|
||||
@ -2,6 +2,7 @@
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "esp_vad.h"
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
@ -18,33 +19,48 @@ typedef enum {
|
||||
|
||||
// the output state of fetch function
|
||||
typedef enum {
|
||||
AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified
|
||||
AFE_FETCH_NOISE = -1, // vad state: noise or silence
|
||||
AFE_FETCH_SPEECH = 0, // vad state: speech
|
||||
AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected
|
||||
AFE_FETCH_ERROR = -3, // fetch empty data, retry it
|
||||
AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified
|
||||
AFE_FETCH_NOISE = -1, // vad state: noise or silence
|
||||
AFE_FETCH_SPEECH = 0, // vad state: speech
|
||||
AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected
|
||||
} afe_fetch_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_PSRAM_LOW_COST = 1,
|
||||
AFE_PSRAM_MEDIA_COST = 2,
|
||||
AFE_PSRAM_HIGH_COST = 3
|
||||
AFE_PSRAM_LOW_COST = 0,
|
||||
AFE_PSRAM_MIDDLE_COST = 1,
|
||||
AFE_PSRAM_HIGH_COST = 2
|
||||
} afe_use_psram_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -5, // The peak amplitude of audio fed to multinet is -5dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -4, // The peak amplitude of audio fed to multinet is -4dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
} afe_mn_peak_agc_mode_t;
|
||||
|
||||
typedef struct {
|
||||
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
|
||||
int mic_num; // mic channel num
|
||||
int ref_num; // reference channel num
|
||||
} afe_pcm_config_t;
|
||||
|
||||
typedef struct {
|
||||
bool aec_init;
|
||||
bool se_init;
|
||||
bool vad_init;
|
||||
bool wakenet_init;
|
||||
int vad_mode;
|
||||
const esp_wn_iface_t *wakenet_model;
|
||||
const model_coeff_getter_t *wakenet_coeff;
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
esp_wn_iface_t *wakenet_model;
|
||||
model_coeff_getter_t *wakenet_coeff;
|
||||
det_mode_t wakenet_mode;
|
||||
afe_sr_mode_t afe_mode;
|
||||
int afe_perferred_core;
|
||||
int afe_perferred_priority;
|
||||
int afe_ringbuf_size;
|
||||
int alloc_from_psram;
|
||||
int agc_mode;
|
||||
afe_use_psram_mode_t alloc_from_psram;
|
||||
afe_mn_peak_agc_mode_t agc_mode;
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
} afe_config_t;
|
||||
|
||||
|
||||
@ -54,7 +70,7 @@ typedef struct {
|
||||
.se_init = true, \
|
||||
.vad_init = true, \
|
||||
.wakenet_init = true, \
|
||||
.vad_mode = 3, \
|
||||
.vad_mode = VAD_MODE_3, \
|
||||
.wakenet_model = &WAKENET_MODEL, \
|
||||
.wakenet_coeff = &WAKENET_COEFF, \
|
||||
.wakenet_mode = DET_MODE_90, \
|
||||
@ -62,8 +78,11 @@ typedef struct {
|
||||
.afe_perferred_core = 0, \
|
||||
.afe_perferred_priority = 5, \
|
||||
.afe_ringbuf_size = 50, \
|
||||
.alloc_from_psram = 1, \
|
||||
.agc_mode = 2, \
|
||||
.alloc_from_psram = AFE_PSRAM_MIDDLE_COST, \
|
||||
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
|
||||
.pcm_config.total_ch_num = 2, \
|
||||
.pcm_config.mic_num = 1, \
|
||||
.pcm_config.ref_num = 1, \
|
||||
}
|
||||
#elif CONFIG_IDF_TARGET_ESP32S3
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
@ -71,7 +90,7 @@ typedef struct {
|
||||
.se_init = true, \
|
||||
.vad_init = true, \
|
||||
.wakenet_init = true, \
|
||||
.vad_mode = 3, \
|
||||
.vad_mode = VAD_MODE_3, \
|
||||
.wakenet_model = &WAKENET_MODEL, \
|
||||
.wakenet_coeff = &WAKENET_COEFF, \
|
||||
.wakenet_mode = DET_MODE_2CH_90, \
|
||||
@ -79,8 +98,11 @@ typedef struct {
|
||||
.afe_perferred_core = 0, \
|
||||
.afe_perferred_priority = 5, \
|
||||
.afe_ringbuf_size = 50, \
|
||||
.alloc_from_psram = AFE_PSRAM_MEDIA_COST, \
|
||||
.agc_mode = 2, \
|
||||
.alloc_from_psram = AFE_PSRAM_HIGH_COST, \
|
||||
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
|
||||
.pcm_config.total_ch_num = 3, \
|
||||
.pcm_config.mic_num = 2, \
|
||||
.pcm_config.ref_num = 1, \
|
||||
}
|
||||
#endif
|
||||
/**
|
||||
@ -113,10 +135,18 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the fetch function
|
||||
* @brief Get the configured total number of channels
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the configured number of mic channels
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of mic channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
@ -232,6 +262,7 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
@ -243,3 +274,5 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_enable_se_t enable_se;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
extern esp_afe_sr_iface_t esp_afe_sr;
|
||||
|
||||
@ -1,20 +1,33 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
// #include "esp_err.h"
|
||||
#include "dl_lib_coefgetter_if.h"
|
||||
#include "esp_wn_iface.h"
|
||||
// //Opaque model data container
|
||||
// typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// Return all possible recognition results
|
||||
#define ESP_MN_RESULT_MAX_NUM 5
|
||||
typedef enum {
|
||||
ESP_MN_STATE_DETECTING = 0, // detecting
|
||||
ESP_MN_STATE_DETECTED = 1, // detected
|
||||
ESP_MN_STATE_TIMEOUT = 2, // time out
|
||||
} esp_mn_state_t;
|
||||
|
||||
typedef struct{
|
||||
esp_mn_state_t state;
|
||||
int num; // The number of phrases in the list, num <= ESP_MN_RESULT_MAX_NUM (5). When num=0, no phrase is recognized.
|
||||
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id.
|
||||
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
|
||||
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
|
||||
} esp_mn_results_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize a model instance with specified model coefficient.
|
||||
*
|
||||
* @param coeff The wakenet model coefficient.
|
||||
* @param coeff The wakenet model coefficient.
|
||||
* @param duration The duration (ms) to trigger the timeout
|
||||
* @parm sample_length Audio length for speech recognition, in ms.
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int sample_length);
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int duration);
|
||||
|
||||
|
||||
/**
|
||||
@ -96,16 +109,22 @@ typedef int (*esp_mn_iface_op_get_det_phrase_id_t)(model_iface_data_t *model);
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Reset the speech commands
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
* @param command_str The string of new commands.
|
||||
* @param err_phrase_id Wrong phrase ID string.
|
||||
* @brief Reset the speech commands recognition model
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_reset_t)(model_iface_data_t *model_data, char *command_str, char *err_phrase_id);
|
||||
|
||||
/**
|
||||
* @brief Get recognition results
|
||||
*
|
||||
* @param model The Model object to query
|
||||
*
|
||||
* @return The current results.
|
||||
*/
|
||||
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Reset the speech commands recognition model
|
||||
*
|
||||
@ -134,6 +153,7 @@ typedef struct {
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
esp_mn_iface_op_reset_t reset;
|
||||
esp_mn_iface_op_get_results_t get_results;
|
||||
esp_mn_iface_op_wakenet_reset_t wakenet_reset;
|
||||
esp_mn_iface_op_close_log_t close_log;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
@ -96,6 +96,9 @@ extern const esp_wn_iface_t esp_sr_wakenet8_quantized8;
|
||||
#elif CONFIG_SR_WN_WN8_HIESP & CONFIG_SR_WN_MODEL_WN8_QUANT8
|
||||
#define WAKENET_COEFF "hiesp8q8"
|
||||
|
||||
#elif CONFIG_SR_WN_WN8_HILEXIN & CONFIG_SR_WN_MODEL_WN8_QUANT
|
||||
#define WAKENET_COEFF "hilexin8"
|
||||
|
||||
#else
|
||||
#error No valid wake word selected.
|
||||
#endif
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +1 @@
|
||||
4e7ee1f3d6dbe62bf556d209d69fb331dbacc72a
|
||||
c008766c5e30abbd2e418086a688de59359ff1df
|
||||
@ -48,6 +48,8 @@ elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT8" i
|
||||
wakenet_model = 'alexa7q8'
|
||||
elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT" in WN_STRING:
|
||||
wakenet_model = 'alexa7'
|
||||
elif "CONFIG_SR_WN_WN8_HILEXIN" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
|
||||
wakenet_model = 'hilexin8'
|
||||
elif "CONFIG_SR_WN_WN8_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
|
||||
wakenet_model = 'alexa8'
|
||||
elif "CONFIG_SR_WN_WN8_HIESP" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT8" in WN_STRING:
|
||||
@ -75,7 +77,6 @@ print(wakenet_model)
|
||||
print(multinet_model)
|
||||
|
||||
target_model = model_path + '/target'
|
||||
|
||||
if os.path.exists(target_model):
|
||||
shutil.rmtree(target_model)
|
||||
os.makedirs(target_model)
|
||||
|
||||
@ -1 +1 @@
|
||||
MN5Q8_v1_english_8_0.9_0.90
|
||||
MN5Q8_v2_english_8_0.9_0.90
|
||||
Binary file not shown.
@ -1 +1 @@
|
||||
wakeNet8_v5h8_alexa_5_0.57_0.59
|
||||
wakeNet8_v5_alexa_5_0.55_0.54
|
||||
@ -1 +1 @@
|
||||
WakeNet8_v3h8_hiesp_5_0.59_0.60
|
||||
WakeNet8_v3h8_hiesp_5_0.60_0.616
|
||||
Binary file not shown.
1
model/wakenet_model/hilexin8/_MODEL_INFO_
Normal file
1
model/wakenet_model/hilexin8/_MODEL_INFO_
Normal file
@ -0,0 +1 @@
|
||||
WakeNet8_v3h8_hilexin_5_0.625_0.635
|
||||
BIN
model/wakenet_model/hilexin8/wn8_data
Normal file
BIN
model/wakenet_model/hilexin8/wn8_data
Normal file
Binary file not shown.
BIN
model/wakenet_model/hilexin8/wn8_index
Normal file
BIN
model/wakenet_model/hilexin8/wn8_index
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user