feature/add AFE component & update wakenet

This commit is contained in:
sxy 2021-02-25 11:01:29 +08:00
parent c5896943ea
commit d289680270
14 changed files with 259 additions and 48 deletions

View File

@ -1,5 +1,11 @@
# Change log for esp-sr
## 0.8.0
support ESP32S3 chip
add wakenet7 & update wakenet5 to support multi-channel detection
remove wakenet6
add AFE pipeline for speech recognition
## 0.7.0
add chinese tts
update noise suppression v2

View File

@ -8,6 +8,7 @@ set(COMPONENT_ADD_INCLUDEDIRS
speech_command_recognition/include
acoustic_algorithm/include
esp-tts/esp_tts_chinese/include
audio_front_end/include
)
@ -18,6 +19,7 @@ target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/wake_w
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/speech_command_recognition")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/acoustic_algorithm")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese")
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/audio_front_end")
IF (IDF_VER MATCHES "v4.")
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libmultinet.a" PRIV_REQUIRES esp-sr)
@ -28,7 +30,7 @@ ENDIF (IDF_VER MATCHES "v4.")
if(IDF_TARGET STREQUAL "esp32")
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
wakenet
dl_lib
dl_lib_esp32
c_speech_features
hilexin_wn3
hilexin_wn4
@ -59,3 +61,15 @@ target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
voice_set_template_esp32s2
"-Wl,--end-group")
endif()
# Libraries linked when building for the "esp32s3beta" IDF target.
# The archives are wrapped in --start-group/--end-group so the linker keeps
# rescanning them until references between them are resolved.
if(IDF_TARGET STREQUAL "esp32s3beta")
    target_link_libraries(${COMPONENT_TARGET}
        "-Wl,--start-group"
            wakenet
            dl_lib_esp32s3
            c_speech_features
            wakeword_model
            customized_word_wn5
            esp_audio_front_end
        "-Wl,--end-group")
endif()

View File

@ -0,0 +1,11 @@
# Component makefile: exports the header directory and links every static
# archive found in the component directory.
# NOTE(review): the COMPONENT_* names look like the legacy GNU Make based
# ESP-IDF component interface — confirm against the build system in use.

# Header directory contributed by this component.
COMPONENT_ADD_INCLUDEDIRS := include
# Source directory list: the component directory itself.
COMPONENT_SRCDIRS := .
# Every lib*.a archive located directly in the component directory.
LIB_FILES := $(shell ls $(COMPONENT_PATH)/lib*.a)
# Turn each archive file name into a linker flag: libfoo.a -> -lfoo.
LIBS := $(patsubst lib%.a,-l%,$(notdir $(LIB_FILES)))
# Add the component directory to the library search path and link the archives found above.
COMPONENT_ADD_LDFLAGS += -L$(COMPONENT_PATH)/ $(LIBS)
# Append the archive paths to ALL_LIB_FILES.
# NOTE(review): presumably so the application is relinked when an archive changes — confirm.
ALL_LIB_FILES += $(LIB_FILES)

View File

@ -0,0 +1,150 @@
#pragma once
#include "stdint.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container: the struct is only forward-declared here, so callers
//hold it solely through the pointer returned by `create` and pass it back to the other operations.
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
//Operating mode of AFE_SR, passed to `create` when an instance is made
typedef enum {
SR_MODE_LOW_COST = 0, //LOW_COST, low memory consumption and CPU loading
SR_MODE_MEDIUM = 1, //MEDIUM
SR_MODE_HIGH_PERF = 2, //HIGH_PERF
} afe_sr_mode_t;
/**
* @brief Function to initialize an AFE_SR instance with a specified mode
*
* @param mode The mode of AFE_SR
* @param perferred_core The preferred core to pin the AFE_SR tasks to.
* If the AFE_SR tasks cannot all run in real time on that one core, the other core is used as well.
* (The parameter name is spelled `perferred_core` in the declaration.)
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_t)(afe_sr_mode_t mode, int perferred_core);
/**
* @brief Get the number of samples per channel, per frame, that need to be passed to the feed function
*
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
* The same frame size applies to the output buffer of the fetch function.
*
* @param afe The AFE_SR object to query
* @return The number of samples per channel in one frame
*/
typedef int (*esp_afe_sr_iface_op_get_frame_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the number of channels of the samples that need to be passed to the feed function
*
* @param afe The AFE_SR object to query
* @return The number of input channels
*/
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the sample rate of the samples to feed to the AFE_SR
*
* @param afe The AFE_SR object to query
* @return The sample rate, in Hz
*/
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
/**
* @brief Feed samples of an audio stream to the AFE_SR
*
* @warning The input data should be interleaved by channel, in the format
* [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH1_1, ..., CHN_1, ...].
* The last channel is the reference signal or far-end signal.
*
* @param afe The AFE_SR object to feed
* @param in The input microphone signal, only signed 16-bit @ 16 kHz is supported. The frame size can be queried by
* `get_frame_chunksize`. The channel number can be queried by `get_channel_num`.
* @return The size of input
* NOTE(review): the unit of the returned size (samples vs. bytes) is not stated here — confirm.
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
/**
* @brief Fetch enhanced samples of an audio stream from the AFE_SR
*
* @warning The output is single channel data, no matter how many channels the input has.
*
* @param afe The AFE_SR object to fetch from
* @param out The output enhanced signal. The frame size can be queried by `get_frame_chunksize`.
* @return The type of the output frame, -1: noise, 0: speech, 1: wake word 1, 2: wake word 2, ...
*/
typedef int (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe, int16_t* out);
/**
* @brief Initialize the wakenet and wake word coefficients, or reset them
* when the wakenet has already been initialized.
*
* @param afe The AFE_SR object to configure
* @param wakenet The pointer to the wakenet interface
* @param model_coeff The coefficient of the wake word model
* @return 0: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe,
esp_wn_iface_t *wakenet,
const model_coeff_getter_t *model_coeff);
/**
* @brief Disable the wakenet model.
*
* @param afe The AFE_SR object to operate on
* @return 0: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable the wakenet model.
*
* @param afe The AFE_SR object to operate on
* @return 0: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable the AEC algorithm.
*
* @param afe The AFE_SR object to operate on
* @return 0: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Enable the AEC algorithm.
*
* @param afe The AFE_SR object to operate on
* @return 0: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
/**
* @brief Destroy an AFE_SR instance
*
* @param afe The AFE_SR object to destroy
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on an AFE_SR.
* Each member is a function pointer of the matching esp_afe_sr_iface_op_*_t type.
*/
typedef struct {
esp_afe_sr_iface_op_create_t create; //create an AFE_SR instance
esp_afe_sr_iface_op_feed_t feed; //feed multi-channel input samples
esp_afe_sr_iface_op_fetch_t fetch; //fetch single-channel enhanced samples
esp_afe_sr_iface_op_get_frame_chunksize_t get_frame_chunksize; //samples per channel per frame
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; //number of input channels
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; //sample rate, in Hz
esp_afe_sr_iface_op_set_wakenet_t set_wakenet; //initialize or reset the wakenet and its coefficients
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet; //disable the wakenet model
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet; //enable the wakenet model
esp_afe_sr_iface_op_disable_aec_t disable_aec; //disable the AEC algorithm
esp_afe_sr_iface_op_enable_aec_t enable_aec; //enable the AEC algorithm
esp_afe_sr_iface_op_destroy_t destroy; //destroy the AFE_SR instance
} esp_afe_sr_iface_t;

View File

@ -0,0 +1,6 @@
#pragma once
#include "esp_afe_sr_iface.h"
//AFE_SR operation tables (see esp_afe_sr_iface_t) that are defined outside this header.
//NOTE(review): the names suggest a two-microphone and a one-microphone variant — confirm against the library documentation.
extern const esp_afe_sr_iface_t esp_afe_sr_2mic;
extern const esp_afe_sr_iface_t esp_afe_sr_1mic;

Binary file not shown.

Binary file not shown.

BIN
lib/libdl_lib_esp32s3.a Normal file

Binary file not shown.

Binary file not shown.

View File

@ -10,7 +10,11 @@ typedef struct model_iface_data_t model_iface_data_t;
//As a consequence also the false alarm rate goes up
typedef enum {
DET_MODE_90 = 0, //Normal, response accuracy rate about 90%
DET_MODE_95 //Aggressive, response accuracy rate about 95%
DET_MODE_95 = 1, //Aggressive, response accuracy rate about 95%
DET_MODE_2CH_90 = 2,
DET_MODE_2CH_95 = 3,
DET_MODE_3CH_90 = 4,
DET_MODE_3CH_95 = 5,
} det_mode_t;
typedef struct {
@ -27,18 +31,6 @@ typedef struct {
*/
typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode);
/**
* @brief Function type to initialze a model instance with a detection mode and specified wake word coefficient
*
* Warning: Just wakeNet6 support this function to select which core to run neural network.
*
* @param det_mode The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
* @param model_coeff The specified wake word model coefficient
* @param core Core to run neural network
* @returns Handle to the model data
*/
typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode, int core);
/**
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
*
@ -50,6 +42,17 @@ typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const mod
*/
typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
/**
* @brief Callback function type to fetch the number of channels of the samples that need to be passed to the detect function
*
* The returned value is a channel count, not a sample count or a size in bytes.
* NOTE(review): presumably the samples of all channels are passed to the detect function together;
* the expected layout is not stated here — confirm.
*
* @param model The model object to query
* @return The number of channels of the samples to feed the detect function
*/
typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
/**
* @brief Get the sample rate of the samples to feed to the detect function
@ -109,6 +112,23 @@ typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model,
*/
typedef int (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Get the volume gain
*
* @param model The model object to query
* @param target_db The target level, in dB, used to calculate the volume gain
* @returns The volume gain
* NOTE(review): whether the gain is a linear factor or a dB value is not stated here — confirm.
*/
typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
/**
* @brief Get the index of the triggered channel. Channel indices start from zero.
*
* @param model The model object to query
* @return The zero-based channel index
*/
typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
/**
* @brief Destroy a speech recognition model
*
@ -122,13 +142,15 @@ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
*/
typedef struct {
esp_wn_iface_op_create_t create;
esp_wn_iface_op_create_pinned_to_core_t create_pinned_to_core;
esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
esp_wn_iface_op_get_channel_num_t get_channel_num;
esp_wn_iface_op_get_samp_rate_t get_samp_rate;
esp_wn_iface_op_get_word_num_t get_word_num;
esp_wn_iface_op_get_word_name_t get_word_name;
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
esp_wn_iface_op_detect_t detect;
esp_wn_iface_op_destroy_t destroy;
} esp_wn_iface_t;

View File

@ -4,25 +4,19 @@
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
//a specific phrase or word.
extern const esp_wn_iface_t esp_sr_wakenet3_quantized;
extern const esp_wn_iface_t esp_sr_wakenet4_quantized;
extern const esp_wn_iface_t esp_sr_wakenet5_quantized;
extern const esp_wn_iface_t esp_sr_wakenet5_float;
extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
extern const esp_wn_iface_t esp_sr_wakenet7_quantized;
extern const esp_wn_iface_t esp_sr_wakenet7_quantized8;
/*
Configure network to use based on what's selected in menuconfig.
*/
#if CONFIG_SR_MODEL_WN3_QUANT
#define WAKENET_MODEL esp_sr_wakenet3_quantized
#elif CONFIG_SR_MODEL_WN4_QUANT
#define WAKENET_MODEL esp_sr_wakenet4_quantized
#elif CONFIG_SR_MODEL_WN5_FLOAT
#define WAKENET_MODEL esp_sr_wakenet5_float
#elif CONFIG_SR_MODEL_WN5_QUANT
#if CONFIG_SR_MODEL_WN5_QUANT
#define WAKENET_MODEL esp_sr_wakenet5_quantized
#elif CONFIG_SR_MODEL_WN6_QUANT
#define WAKENET_MODEL esp_sr_wakenet6_quantized
#elif CONFIG_SR_MODEL_WN7_QUANT
#define WAKENET_MODEL esp_sr_wakenet7_quantized
#elif CONFIG_SR_MODEL_WN7_QUANT8
#define WAKENET_MODEL esp_sr_wakenet7_quantized8
#else
#error No valid neural network model selected.
#endif
@ -30,19 +24,7 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
/*
Configure wake word to use based on what's selected in menuconfig.
*/
#if CONFIG_SR_WN3_HILEXIN
#include "hilexin_wn3.h"
#define WAKENET_COEFF get_coeff_hilexin_wn3
#elif CONFIG_SR_WN4_HILEXIN
#include "hilexin_wn4.h"
#define WAKENET_COEFF get_coeff_hilexin_wn4
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_FLOAT
#include "hilexin_wn5_float.h"
#define WAKENET_COEFF get_coeff_hilexin_wn5_float
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
#if CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
#include "hilexin_wn5.h"
#define WAKENET_COEFF get_coeff_hilexin_wn5
@ -74,17 +56,37 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
#include "hijeson_wn5X3.h"
#define WAKENET_COEFF get_coeff_hijeson_wn5X3
#elif CONFIG_SR_WN6_NIHAOXIAOXIN
#include "nihaoxiaoxin_wn6.h"
#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6
#elif CONFIG_SR_WN5_CUSTOMIZED_WORD
#include "customized_word_wn5.h"
#define WAKENET_COEFF get_coeff_customized_word_wn5
#elif CONFIG_SR_WN6_CUSTOMIZED_WORD
#include "customized_word_wn6.h"
#define WAKENET_COEFF get_coeff_customized_word_wn6
#elif CONFIG_SR_WN7_CUSTOMIZED_WORD
#include "customized_word_wn7.h"
#define WAKENET_COEFF get_coeff_customized_word_wn7
#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT
#include "xiaoaitongxue_wn7.h"
#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7
#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT8
#include "xiaoaitongxue_wn7_q8.h"
#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7_q8
#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT
#include "hilexin_wn7.h"
#define WAKENET_COEFF get_coeff_hilexin_wn7
#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT8
#include "hilexin_wn7_q8.h"
#define WAKENET_COEFF get_coeff_hilexin_wn7_q8
#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT
#include "alexa_wn7.h"
#define WAKENET_COEFF get_coeff_alexa_wn7
#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT8
#include "alexa_wn7_q8.h"
#define WAKENET_COEFF get_coeff_alexa_wn7_q8
#else
#error No valid wake word selected.

Binary file not shown.