diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 1259325..03cdb40 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -108,6 +108,10 @@ choice SR_WN_WAKE_WORD_SEL config SR_WN_WN7_HILEXIN bool "Hi,Lexin (WakeNet7)" depends on SR_WN_MODEL_WN7_QUANT || SR_WN_MODEL_WN7_QUANT8 + + config SR_WN_WN8_HILEXIN + bool "Hi,Lexin (WakeNet8)" + depends on SR_WN_MODEL_WN8_QUANT config SR_WN_WN8_ALEXA bool "Alexa (WakeNet8)" @@ -177,7 +181,6 @@ choice SR_MN_MODE_SEL config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION bool "chinese single recognition (MultiNet4.5)" depends on SR_MN_CHINESE && IDF_TARGET_ESP32S3 - endchoice menu "Add speech commands" diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index 73f60c3..ada84a8 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h @@ -2,6 +2,7 @@ #include "stdint.h" #include "esp_wn_iface.h" #include "esp_wn_models.h" +#include "esp_vad.h" //AFE: Audio Front-End //SR: Speech Recognition @@ -18,33 +19,48 @@ typedef enum { // the output state of fetch function typedef enum { - AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified - AFE_FETCH_NOISE = -1, // vad state: noise or silence - AFE_FETCH_SPEECH = 0, // vad state: speech - AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected + AFE_FETCH_ERROR = -3, // fetch empty data, retry it + AFE_FETCH_CHANNEL_VERIFIED = -2, // wwe state: output channel is verified + AFE_FETCH_NOISE = -1, // vad state: noise or silence + AFE_FETCH_SPEECH = 0, // vad state: speech + AFE_FETCH_WWE_DETECTED = 1 // wwe state: wake word is detected } afe_fetch_mode_t; typedef enum { - AFE_PSRAM_LOW_COST = 1, - AFE_PSRAM_MEDIA_COST = 2, - AFE_PSRAM_HIGH_COST = 3 + AFE_PSRAM_LOW_COST = 0, + AFE_PSRAM_MIDDLE_COST = 1, + AFE_PSRAM_HIGH_COST = 2 } afe_use_psram_mode_t; +typedef enum { + AFE_MN_PEAK_AGC_MODE_1 = -5, // The peak amplitude of audio fed to multinet is -5dB + AFE_MN_PEAK_AGC_MODE_2 = -4, // The peak 
amplitude of audio fed to multinet is -4dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain +} afe_mn_peak_agc_mode_t; + +typedef struct { + int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num + int mic_num; // mic channel num + int ref_num; // reference channel num +} afe_pcm_config_t; + typedef struct { bool aec_init; bool se_init; bool vad_init; bool wakenet_init; - int vad_mode; - const esp_wn_iface_t *wakenet_model; - const model_coeff_getter_t *wakenet_coeff; + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + esp_wn_iface_t *wakenet_model; + model_coeff_getter_t *wakenet_coeff; det_mode_t wakenet_mode; afe_sr_mode_t afe_mode; int afe_perferred_core; int afe_perferred_priority; int afe_ringbuf_size; - int alloc_from_psram; - int agc_mode; + afe_use_psram_mode_t alloc_from_psram; + afe_mn_peak_agc_mode_t agc_mode; + afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. 
} afe_config_t; @@ -54,7 +70,7 @@ typedef struct { .se_init = true, \ .vad_init = true, \ .wakenet_init = true, \ - .vad_mode = 3, \ + .vad_mode = VAD_MODE_3, \ .wakenet_model = &WAKENET_MODEL, \ .wakenet_coeff = &WAKENET_COEFF, \ .wakenet_mode = DET_MODE_90, \ @@ -62,8 +78,11 @@ typedef struct { .afe_perferred_core = 0, \ .afe_perferred_priority = 5, \ .afe_ringbuf_size = 50, \ - .alloc_from_psram = 1, \ - .agc_mode = 2, \ + .alloc_from_psram = AFE_PSRAM_MIDDLE_COST, \ + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ + .pcm_config.total_ch_num = 2, \ + .pcm_config.mic_num = 1, \ + .pcm_config.ref_num = 1, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -71,7 +90,7 @@ typedef struct { .se_init = true, \ .vad_init = true, \ .wakenet_init = true, \ - .vad_mode = 3, \ + .vad_mode = VAD_MODE_3, \ .wakenet_model = &WAKENET_MODEL, \ .wakenet_coeff = &WAKENET_COEFF, \ .wakenet_mode = DET_MODE_2CH_90, \ @@ -79,8 +98,11 @@ typedef struct { .afe_perferred_core = 0, \ .afe_perferred_priority = 5, \ .afe_ringbuf_size = 50, \ - .alloc_from_psram = AFE_PSRAM_MEDIA_COST, \ - .agc_mode = 2, \ + .alloc_from_psram = AFE_PSRAM_HIGH_COST, \ + .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \ + .pcm_config.total_ch_num = 3, \ + .pcm_config.mic_num = 2, \ + .pcm_config.ref_num = 1, \ } #endif /** @@ -113,10 +135,18 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** - * @brief Get the channel number of samples that need to be passed to the fetch function + * @brief Get the configured total channel number * * @param afe The AFE_SR object to query - * @return The amount of samples to feed the fetch function + * @return The amount of total channels + */ +typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe); + +/** + * @brief Get the configured mic channel number + * + * @param afe The AFE_SR object to query + * @return The amount of 
mic channels */ typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe); @@ -232,6 +262,7 @@ typedef struct { esp_afe_sr_iface_op_fetch_t fetch; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; + esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num; esp_afe_sr_iface_op_get_channel_num_t get_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; esp_afe_sr_iface_op_set_wakenet_t set_wakenet; @@ -243,3 +274,5 @@ typedef struct { esp_afe_sr_iface_op_enable_se_t enable_se; esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; + +extern esp_afe_sr_iface_t esp_afe_sr; diff --git a/include/esp32s3/esp_mn_iface.h b/include/esp32s3/esp_mn_iface.h index 79fa957..334fd75 100644 --- a/include/esp32s3/esp_mn_iface.h +++ b/include/esp32s3/esp_mn_iface.h @@ -1,20 +1,33 @@ #pragma once #include "stdint.h" -// #include "esp_err.h" #include "dl_lib_coefgetter_if.h" #include "esp_wn_iface.h" -// //Opaque model data container -// typedef struct model_iface_data_t model_iface_data_t; + +// Return all possible recognition results +#define ESP_MN_RESULT_MAX_NUM 5 +typedef enum { + ESP_MN_STATE_DETECTING = 0, // detecting + ESP_MN_STATE_DETECTED = 1, // detected + ESP_MN_STATE_TIMEOUT = 2, // time out +} esp_mn_state_t; + +typedef struct{ + esp_mn_state_t state; + int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized. + int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id. + int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id. + float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability. +} esp_mn_results_t; /** * @brief Initialze a model instance with specified model coefficient. * * @param coeff The wakenet model coefficient. - * @param coeff The wakenet model coefficient. + * @param duration The duration (ms) to trigger the timeout * @parm sample_length Audio length for speech recognition, in ms. 
* @returns Handle to the model data. */ -typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int sample_length); +typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int duration); /** @@ -96,16 +109,22 @@ typedef int (*esp_mn_iface_op_get_det_phrase_id_t)(model_iface_data_t *model); */ typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model); + /** - * @brief Reset the speech commands - * - * @param model_data The model object to query. - * @param command_str The string of new commands. - * @param err_phrase_id Wrong phrase ID string. + * @brief Reset the speech commands recognition model * */ typedef void (*esp_mn_iface_op_reset_t)(model_iface_data_t *model_data, char *command_str, char *err_phrase_id); +/** + * @brief Get recognition results + * + * @param model The model object to query + * + * @return The current results. + */ +typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model); + /** * @brief Reset the speech commands recognition model * @@ -134,6 +153,7 @@ typedef struct { esp_mn_iface_op_detect_t detect; esp_mn_iface_op_destroy_t destroy; esp_mn_iface_op_reset_t reset; + esp_mn_iface_op_get_results_t get_results; esp_mn_iface_op_wakenet_reset_t wakenet_reset; esp_mn_iface_op_close_log_t close_log; } esp_mn_iface_t; diff --git a/include/esp32s3/esp_wn_models.h b/include/esp32s3/esp_wn_models.h index 225456f..e830545 100644 --- a/include/esp32s3/esp_wn_models.h +++ b/include/esp32s3/esp_wn_models.h @@ -96,6 +96,9 @@ extern const esp_wn_iface_t esp_sr_wakenet8_quantized8; #elif CONFIG_SR_WN_WN8_HIESP & CONFIG_SR_WN_MODEL_WN8_QUANT8 #define WAKENET_COEFF "hiesp8q8" +#elif CONFIG_SR_WN_WN8_HILEXIN & CONFIG_SR_WN_MODEL_WN8_QUANT +#define WAKENET_COEFF "hilexin8" + #else #error No valid wake word selected. 
#endif diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index 026588c..ac78fd6 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 3d46107..e0e5205 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 584fbe7..1131429 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index f4efd9e..2190cde 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index c0ebff4..c4acb1a 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/libversion b/libversion index ea36c61..826a98d 100644 --- a/libversion +++ b/libversion @@ -1 +1 @@ -4e7ee1f3d6dbe62bf556d209d69fb331dbacc72a \ No newline at end of file +c008766c5e30abbd2e418086a688de59359ff1df \ No newline at end of file diff --git a/model/movemodel.py b/model/movemodel.py index 134b5a8..8528c7d 100644 --- a/model/movemodel.py +++ b/model/movemodel.py @@ -48,6 +48,8 @@ elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT8" i wakenet_model = 'alexa7q8' elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT" in WN_STRING: wakenet_model = 'alexa7' +elif "CONFIG_SR_WN_WN8_HILEXIN" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING: + wakenet_model = 'hilexin8' elif "CONFIG_SR_WN_WN8_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING: wakenet_model = 'alexa8' elif "CONFIG_SR_WN_WN8_HIESP" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT8" in WN_STRING: @@ -75,7 +77,6 @@ print(wakenet_model) print(multinet_model) 
target_model = model_path + '/target' - if os.path.exists(target_model): shutil.rmtree(target_model) os.makedirs(target_model) diff --git a/model/multinet_model/mn5q8en/_MODEL_INFO_ b/model/multinet_model/mn5q8en/_MODEL_INFO_ index 79f2a2e..2488263 100644 --- a/model/multinet_model/mn5q8en/_MODEL_INFO_ +++ b/model/multinet_model/mn5q8en/_MODEL_INFO_ @@ -1 +1 @@ -MN5Q8_v1_english_8_0.9_0.90 \ No newline at end of file +MN5Q8_v2_english_8_0.9_0.90 \ No newline at end of file diff --git a/model/multinet_model/mn5q8en/mn5q8en_data b/model/multinet_model/mn5q8en/mn5q8en_data index 21d7298..2a6a73a 100644 Binary files a/model/multinet_model/mn5q8en/mn5q8en_data and b/model/multinet_model/mn5q8en/mn5q8en_data differ diff --git a/model/wakenet_model/alexa8/_MODEL_INFO_ b/model/wakenet_model/alexa8/_MODEL_INFO_ index 7692715..e0d78b6 100644 --- a/model/wakenet_model/alexa8/_MODEL_INFO_ +++ b/model/wakenet_model/alexa8/_MODEL_INFO_ @@ -1 +1 @@ -wakeNet8_v5h8_alexa_5_0.57_0.59 +wakeNet8_v5_alexa_5_0.55_0.54 \ No newline at end of file diff --git a/model/wakenet_model/hiesp8/_MODEL_INFO_ b/model/wakenet_model/hiesp8/_MODEL_INFO_ index c85386e..4bf43d1 100644 --- a/model/wakenet_model/hiesp8/_MODEL_INFO_ +++ b/model/wakenet_model/hiesp8/_MODEL_INFO_ @@ -1 +1 @@ -WakeNet8_v3h8_hiesp_5_0.59_0.60 +WakeNet8_v3h8_hiesp_5_0.60_0.616 \ No newline at end of file diff --git a/model/wakenet_model/hiesp8/wn8_data b/model/wakenet_model/hiesp8/wn8_data index 2d1ec77..491a3eb 100644 Binary files a/model/wakenet_model/hiesp8/wn8_data and b/model/wakenet_model/hiesp8/wn8_data differ diff --git a/model/wakenet_model/hilexin8/_MODEL_INFO_ b/model/wakenet_model/hilexin8/_MODEL_INFO_ new file mode 100644 index 0000000..be3e4d1 --- /dev/null +++ b/model/wakenet_model/hilexin8/_MODEL_INFO_ @@ -0,0 +1 @@ +WakeNet8_v3h8_hilexin_5_0.625_0.635 diff --git a/model/wakenet_model/hilexin8/wn8_data b/model/wakenet_model/hilexin8/wn8_data new file mode 100644 index 0000000..430d06d Binary files /dev/null and 
b/model/wakenet_model/hilexin8/wn8_data differ diff --git a/model/wakenet_model/hilexin8/wn8_index b/model/wakenet_model/hilexin8/wn8_index new file mode 100644 index 0000000..cf8a321 Binary files /dev/null and b/model/wakenet_model/hilexin8/wn8_index differ