feat(MN): Support continuous recognition

2025-09-15 15:28:44 +08:00 · 2022-01-19 16:13:04 +08:00 · 2022-01-19 16:13:04 +08:00 · 4d3e550e72
commit 4d3e550e72
parent d05cf97972
19 changed files with 97 additions and 36 deletions
--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@ -108,6 +108,10 @@ choice SR_WN_WAKE_WORD_SEL
    config SR_WN_WN7_HILEXIN
        bool "Hi,Lexin (WakeNet7)"
        depends on SR_WN_MODEL_WN7_QUANT || SR_WN_MODEL_WN7_QUANT8
+    
+    config SR_WN_WN8_HILEXIN
+        bool "Hi,Lexin (WakeNet8)"
+        depends on SR_WN_MODEL_WN8_QUANT 

    config SR_WN_WN8_ALEXA
        bool "Alexa (WakeNet8)"
@ -177,7 +181,6 @@ choice SR_MN_MODE_SEL
    config SR_MN_CN_MULTINET4_5_SINGLE_RECOGNITION
        bool "chinese single recognition (MultiNet4.5)"
        depends on SR_MN_CHINESE && IDF_TARGET_ESP32S3
-
 endchoice

 menu "Add speech commands"
--- a/include/esp32s3/esp_afe_sr_iface.h
+++ b/include/esp32s3/esp_afe_sr_iface.h
@ -2,6 +2,7 @@
 #include "stdint.h"
 #include "esp_wn_iface.h"
 #include "esp_wn_models.h"
+#include "esp_vad.h"

 //AFE: Audio Front-End 
 //SR:  Speech Recognition
@ -18,33 +19,48 @@ typedef enum {

 // the output state of fetch function
 typedef enum {
-    AFE_FETCH_CHANNEL_VERIFIED = -2,  // wwe state: output channel is verified
-    AFE_FETCH_NOISE = -1,             // vad state: noise or silence
-    AFE_FETCH_SPEECH = 0,             // vad state: speech
-    AFE_FETCH_WWE_DETECTED = 1        // wwe state: wake word is detected
+    AFE_FETCH_ERROR = -3,                   // fetch returned empty data; retry the fetch
+    AFE_FETCH_CHANNEL_VERIFIED = -2,        // wwe state: output channel is verified
+    AFE_FETCH_NOISE = -1,                   // vad state: noise or silence
+    AFE_FETCH_SPEECH = 0,                   // vad state: speech
+    AFE_FETCH_WWE_DETECTED = 1              // wwe state: wake word is detected
 } afe_fetch_mode_t;

 typedef enum {
-    AFE_PSRAM_LOW_COST = 1,
-    AFE_PSRAM_MEDIA_COST = 2,
-    AFE_PSRAM_HIGH_COST = 3
+    AFE_PSRAM_LOW_COST = 0,
+    AFE_PSRAM_MIDDLE_COST = 1,
+    AFE_PSRAM_HIGH_COST = 2
 } afe_use_psram_mode_t;

+typedef enum {
+    AFE_MN_PEAK_AGC_MODE_1 = -5,            // The peak amplitude of audio fed to multinet is -5dB
+    AFE_MN_PEAK_AGC_MODE_2 = -4,            // The peak amplitude of audio fed to multinet is -4dB
+    AFE_MN_PEAK_AGC_MODE_3 = -3,            // The peak amplitude of audio fed to multinet is -3dB
+    AFE_MN_PEAK_NO_AGC = 0,                 // There is no agc gain
+} afe_mn_peak_agc_mode_t;
+
+typedef struct {
+    int total_ch_num;                       // total channel num. It must be: total_ch_num = mic_num + ref_num
+    int mic_num;                            // mic channel num
+    int ref_num;                            // reference channel num
+} afe_pcm_config_t;
+
 typedef struct {
    bool aec_init;
    bool se_init;
    bool vad_init;
    bool wakenet_init;
-    int vad_mode;
-    const esp_wn_iface_t *wakenet_model;
-    const model_coeff_getter_t *wakenet_coeff;
+    vad_mode_t vad_mode;                    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
+    esp_wn_iface_t *wakenet_model;
+    model_coeff_getter_t *wakenet_coeff;
    det_mode_t wakenet_mode;
    afe_sr_mode_t afe_mode;
    int afe_perferred_core;
    int afe_perferred_priority;
    int afe_ringbuf_size;
-    int alloc_from_psram;
-    int agc_mode;
+    afe_use_psram_mode_t alloc_from_psram;
+    afe_mn_peak_agc_mode_t agc_mode;
+    afe_pcm_config_t pcm_config;            // Channel numbers of the original data that is fed to the afe feed function.
 } afe_config_t;


@ -54,7 +70,7 @@ typedef struct {
    .se_init = true, \
    .vad_init = true, \
    .wakenet_init = true, \
-    .vad_mode = 3, \
+    .vad_mode = VAD_MODE_3, \
    .wakenet_model = &WAKENET_MODEL, \
    .wakenet_coeff = &WAKENET_COEFF, \
    .wakenet_mode = DET_MODE_90, \
@ -62,8 +78,11 @@ typedef struct {
    .afe_perferred_core = 0, \
    .afe_perferred_priority = 5, \
    .afe_ringbuf_size = 50, \
-    .alloc_from_psram = 1, \
-    .agc_mode = 2, \
+    .alloc_from_psram = AFE_PSRAM_MIDDLE_COST, \
+    .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
+    .pcm_config.total_ch_num = 2, \
+    .pcm_config.mic_num = 1, \
+    .pcm_config.ref_num = 1, \
 }
 #elif CONFIG_IDF_TARGET_ESP32S3
 #define AFE_CONFIG_DEFAULT() { \
@ -71,7 +90,7 @@ typedef struct {
    .se_init = true, \
    .vad_init = true, \
    .wakenet_init = true, \
-    .vad_mode = 3, \
+    .vad_mode = VAD_MODE_3, \
    .wakenet_model = &WAKENET_MODEL, \
    .wakenet_coeff = &WAKENET_COEFF, \
    .wakenet_mode = DET_MODE_2CH_90, \
@ -79,8 +98,11 @@ typedef struct {
    .afe_perferred_core = 0, \
    .afe_perferred_priority = 5, \
    .afe_ringbuf_size = 50, \
-    .alloc_from_psram = AFE_PSRAM_MEDIA_COST, \
-    .agc_mode = 2, \
+    .alloc_from_psram = AFE_PSRAM_HIGH_COST, \
+    .agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
+    .pcm_config.total_ch_num = 3, \
+    .pcm_config.mic_num = 2, \
+    .pcm_config.ref_num = 1, \
 }
 #endif
 /**
@ -113,10 +135,18 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
 typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);

 /**
- * @brief Get the channel number of samples that need to be passed to the fetch function
+ * @brief Get the configured total number of channels
 * 
 * @param afe The AFE_SR object to query
- * @return The amount of samples to feed the fetch function
+ * @return The total number of channels
+ */
+typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
+
+/**
+ * @brief Get the configured number of mic channels
+ * 
+ * @param afe The AFE_SR object to query
+ * @return The number of mic channels
 */
 typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);

@ -232,6 +262,7 @@ typedef struct {
    esp_afe_sr_iface_op_fetch_t fetch;
    esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
    esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
+    esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
    esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
    esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
    esp_afe_sr_iface_op_set_wakenet_t  set_wakenet; 
@ -243,3 +274,5 @@ typedef struct {
    esp_afe_sr_iface_op_enable_se_t enable_se;
    esp_afe_sr_iface_op_destroy_t destroy;
 } esp_afe_sr_iface_t;
+
+extern esp_afe_sr_iface_t esp_afe_sr;
--- a/include/esp32s3/esp_mn_iface.h
+++ b/include/esp32s3/esp_mn_iface.h
@ -1,20 +1,33 @@
 #pragma once
 #include "stdint.h"
-// #include "esp_err.h"
 #include "dl_lib_coefgetter_if.h"
 #include "esp_wn_iface.h"
-// //Opaque model data container
-// typedef struct model_iface_data_t model_iface_data_t;
+
+// Maximum number of recognition results that can be returned
+#define ESP_MN_RESULT_MAX_NUM 5
+typedef enum {
+	ESP_MN_STATE_DETECTING = 0,     // detecting
+	ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_TIMEOUT = 2,       // time out
+} esp_mn_state_t;
+
+typedef struct{
+    esp_mn_state_t state;
+    int num;                // The number of phrases in the list, num<=ESP_MN_RESULT_MAX_NUM. When num=0, no phrase is recognized.
+    int command_id[ESP_MN_RESULT_MAX_NUM];     // The list of command id.
+    int phrase_id[ESP_MN_RESULT_MAX_NUM];      // The list of phrase id.
+    float prob[ESP_MN_RESULT_MAX_NUM];         // The list of probability.
+} esp_mn_results_t;

 /**
 * @brief Initialze a model instance with specified model coefficient.
 *
 * @param coeff       The wakenet model coefficient.
- * @param coeff        The wakenet model coefficient.
+ * @param duration    The duration (ms) to trigger the timeout
 * @parm sample_length Audio length for speech recognition, in ms.
 * @returns Handle to the model data.
 */
-typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int sample_length);
+typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const model_coeff_getter_t *coeff, int duration);


 /**
@ -96,16 +109,22 @@ typedef int (*esp_mn_iface_op_get_det_phrase_id_t)(model_iface_data_t *model);
 */
 typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);

+
 /**
- * @brief Reset the speech commands
- *
- * @param model_data       The model object to query.
- * @param command_str      The string of new commands.
- * @param err_phrase_id    Wrong phrase ID string.
+ * @brief Reset the speech commands recognition model
 *
 */
 typedef void (*esp_mn_iface_op_reset_t)(model_iface_data_t *model_data, char *command_str, char *err_phrase_id);

+/**
+ * @brief Get recognition results 
+ *
+ * @param model       The model object to query
+ * 
+ * @return The current results.
+ */
+typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
+
 /**
 * @brief Reset the speech commands recognition model
 *
@ -134,6 +153,7 @@ typedef struct {
    esp_mn_iface_op_detect_t detect; 
    esp_mn_iface_op_destroy_t destroy;
    esp_mn_iface_op_reset_t reset;
+    esp_mn_iface_op_get_results_t get_results;
    esp_mn_iface_op_wakenet_reset_t wakenet_reset;
    esp_mn_iface_op_close_log_t close_log;
 } esp_mn_iface_t;
--- a/include/esp32s3/esp_wn_models.h
+++ b/include/esp32s3/esp_wn_models.h
@ -96,6 +96,9 @@ extern const esp_wn_iface_t esp_sr_wakenet8_quantized8;
 #elif CONFIG_SR_WN_WN8_HIESP & CONFIG_SR_WN_MODEL_WN8_QUANT8
 #define WAKENET_COEFF "hiesp8q8"

+#elif CONFIG_SR_WN_WN8_HILEXIN & CONFIG_SR_WN_MODEL_WN8_QUANT
+#define WAKENET_COEFF "hilexin8"
+
 #else
 #error No valid wake word selected.
 #endif
--- a/lib/esp32s3/libdl_lib.a
+++ b/lib/esp32s3/libdl_lib.a
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libesp_audio_processor.a
+++ b/lib/esp32s3/libesp_audio_processor.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/2
+++ b/2
@ -1 +1 @@
-4e7ee1f3d6dbe62bf556d209d69fb331dbacc72a
+c008766c5e30abbd2e418086a688de59359ff1df
--- a/model/movemodel.py
+++ b/model/movemodel.py
@ -48,6 +48,8 @@ elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT8" i
    wakenet_model = 'alexa7q8'
 elif "CONFIG_SR_WN_WN7_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN7_QUANT" in WN_STRING:
    wakenet_model = 'alexa7'
+elif "CONFIG_SR_WN_WN8_HILEXIN" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
+    wakenet_model = 'hilexin8'
 elif "CONFIG_SR_WN_WN8_ALEXA" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT" in WN_STRING:
    wakenet_model = 'alexa8'
 elif "CONFIG_SR_WN_WN8_HIESP" in WN_STRING and "CONFIG_SR_WN_MODEL_WN8_QUANT8" in WN_STRING:
@ -75,7 +77,6 @@ print(wakenet_model)
 print(multinet_model)

 target_model = model_path + '/target'
-
 if os.path.exists(target_model):
    shutil.rmtree(target_model)
 os.makedirs(target_model)
--- a/model/multinet_model/mn5q8en/_MODEL_INFO_
+++ b/model/multinet_model/mn5q8en/_MODEL_INFO_
@ -1 +1 @@
-MN5Q8_v1_english_8_0.9_0.90
+MN5Q8_v2_english_8_0.9_0.90
--- a/model/multinet_model/mn5q8en/mn5q8en_data
+++ b/model/multinet_model/mn5q8en/mn5q8en_data
--- a/model/wakenet_model/alexa8/_MODEL_INFO_
+++ b/model/wakenet_model/alexa8/_MODEL_INFO_
@ -1 +1 @@
-wakeNet8_v5h8_alexa_5_0.57_0.59
+wakeNet8_v5_alexa_5_0.55_0.54
--- a/model/wakenet_model/hiesp8/_MODEL_INFO_
+++ b/model/wakenet_model/hiesp8/_MODEL_INFO_
@ -1 +1 @@
-WakeNet8_v3h8_hiesp_5_0.59_0.60
+WakeNet8_v3h8_hiesp_5_0.60_0.616
--- a/model/wakenet_model/hiesp8/wn8_data
+++ b/model/wakenet_model/hiesp8/wn8_data
--- a/model/wakenet_model/hilexin8/_MODEL_INFO_
+++ b/model/wakenet_model/hilexin8/_MODEL_INFO_
@ -0,0 +1 @@
+WakeNet8_v3h8_hilexin_5_0.625_0.635
--- a/model/wakenet_model/hilexin8/wn8_data
+++ b/model/wakenet_model/hilexin8/wn8_data
--- a/model/wakenet_model/hilexin8/wn8_index
+++ b/model/wakenet_model/hilexin8/wn8_index