mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
feature/add AFE component & update wakenet
This commit is contained in:
parent
c5896943ea
commit
d289680270
@ -1,5 +1,11 @@
|
||||
# Change log for esp-sr
|
||||
|
||||
## 0.8.0
|
||||
support ESP32S3 chip
|
||||
add wakenet7 & update wakenet5 to support multi-channel detection
|
||||
remove wakenet6
|
||||
add AFE pipeline for speech recognition
|
||||
|
||||
## 0.7.0
|
||||
add chinese tts
|
||||
update noise suppression v2
|
||||
|
||||
@ -8,6 +8,7 @@ set(COMPONENT_ADD_INCLUDEDIRS
|
||||
speech_command_recognition/include
|
||||
acoustic_algorithm/include
|
||||
esp-tts/esp_tts_chinese/include
|
||||
audio_front_end/include
|
||||
)
|
||||
|
||||
|
||||
@ -18,6 +19,7 @@ target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/wake_w
|
||||
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/speech_command_recognition")
|
||||
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/acoustic_algorithm")
|
||||
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese")
|
||||
target_link_libraries(${COMPONENT_TARGET} "-L ${CMAKE_CURRENT_SOURCE_DIR}/audio_front_end")
|
||||
|
||||
IF (IDF_VER MATCHES "v4.")
|
||||
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/libmultinet.a" PRIV_REQUIRES esp-sr)
|
||||
@ -28,7 +30,7 @@ ENDIF (IDF_VER MATCHES "v4.")
|
||||
if(IDF_TARGET STREQUAL "esp32")
|
||||
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
|
||||
wakenet
|
||||
dl_lib
|
||||
dl_lib_esp32
|
||||
c_speech_features
|
||||
hilexin_wn3
|
||||
hilexin_wn4
|
||||
@ -59,3 +61,15 @@ target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
|
||||
voice_set_template_esp32s2
|
||||
"-Wl,--end-group")
|
||||
endif()
|
||||
|
||||
|
||||
if(IDF_TARGET STREQUAL "esp32s3beta")
|
||||
target_link_libraries(${COMPONENT_TARGET} "-Wl,--start-group"
|
||||
wakenet
|
||||
dl_lib_esp32s3
|
||||
c_speech_features
|
||||
wakeword_model
|
||||
customized_word_wn5
|
||||
esp_audio_front_end
|
||||
"-Wl,--end-group")
|
||||
endif()
|
||||
|
||||
Binary file not shown.
11
audio_front_end/component.mk
Normal file
11
audio_front_end/component.mk
Normal file
@ -0,0 +1,11 @@
|
||||
COMPONENT_ADD_INCLUDEDIRS := include
|
||||
|
||||
COMPONENT_SRCDIRS := .
|
||||
|
||||
LIB_FILES := $(shell ls $(COMPONENT_PATH)/lib*.a)
|
||||
|
||||
LIBS := $(patsubst lib%.a,-l%,$(notdir $(LIB_FILES)))
|
||||
|
||||
COMPONENT_ADD_LDFLAGS += -L$(COMPONENT_PATH)/ $(LIBS)
|
||||
|
||||
ALL_LIB_FILES += $(LIB_FILES)
|
||||
150
audio_front_end/include/esp_afe_sr_iface.h
Normal file
150
audio_front_end/include/esp_afe_sr_iface.h
Normal file
@ -0,0 +1,150 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
//Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
//Set AFE_SR mode
|
||||
typedef enum {
|
||||
SR_MODE_LOW_COST = 0, //LOW_COST, low memory consumption and CPU loading
|
||||
SR_MODE_MEDIUM = 1, //MEDIUM
|
||||
SR_MODE_HIGH_PERF = 2, //HIGH_PERF
|
||||
} afe_sr_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance with a specified mode
|
||||
*
|
||||
* @param mode The mode of AFE_SR
|
||||
* @param perferred_core The preferred core to be pinned.
|
||||
* If the AFE_SR tasks cannot all run in real time on a single core, the other core is used as well.
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_t)(afe_sr_mode_t mode, int perferred_core);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
*
|
||||
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_frame_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the fetch function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The number of channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the AFE_SR
|
||||
*
|
||||
* @Warning The input data should be arranged in the format of [CH0_0, CH1_0, ..., CHN_0, CH0_1, CH1_1, ..., CHN_1, ...].
|
||||
* The last channel is reference signal or far-end signal.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param in The input microphone signal; only signed 16-bit samples @ 16 kHz are supported. The frame size can be queried by the
|
||||
* `get_frame_chunksize`. The channel number can be queried by `get_channel_num`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param out The output enhanced signal. The frame size can be queried by the `get_frame_chunksize`.
|
||||
* @return The type of output, -1: noise, 0: speech, 1: wake word 1, 2: wake word 2, ...
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe, int16_t* out);
|
||||
|
||||
/**
|
||||
* @brief Initialize wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* when wakenet has been initialized.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param wakenet The pointer of wakenet
|
||||
* @param model_coeff The coefficient of wake word model
|
||||
* @return 0: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe,
|
||||
esp_wn_iface_t *wakenet,
|
||||
const model_coeff_getter_t *model_coeff);
|
||||
|
||||
/**
|
||||
* @brief Disable wakenet model.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return 0: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable wakenet model.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return 0: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable AEC algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return 0: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable AEC algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return 0: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy an AFE_SR instance
|
||||
*
|
||||
* @param afe AFE_SR object to destroy
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on an AFE_SR.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_afe_sr_iface_op_create_t create;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_get_frame_chunksize_t get_frame_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_aec_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_aec_t enable_aec;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
6
audio_front_end/include/esp_afe_sr_models.h
Normal file
6
audio_front_end/include/esp_afe_sr_models.h
Normal file
@ -0,0 +1,6 @@
|
||||
#pragma once
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
extern const esp_afe_sr_iface_t esp_afe_sr_2mic;
|
||||
extern const esp_afe_sr_iface_t esp_afe_sr_1mic;
|
||||
|
||||
BIN
audio_front_end/libesp_audio_front_end.a
Normal file
BIN
audio_front_end/libesp_audio_front_end.a
Normal file
Binary file not shown.
Binary file not shown.
BIN
lib/libdl_lib_esp32s3.a
Normal file
BIN
lib/libdl_lib_esp32s3.a
Normal file
Binary file not shown.
BIN
lib/libwakenet.a
BIN
lib/libwakenet.a
Binary file not shown.
@ -10,7 +10,11 @@ typedef struct model_iface_data_t model_iface_data_t;
|
||||
//As a consequence also the false alarm rate goes up
|
||||
typedef enum {
|
||||
DET_MODE_90 = 0, //Normal, response accuracy rate about 90%
|
||||
DET_MODE_95 //Aggressive, response accuracy rate about 95%
|
||||
DET_MODE_95 = 1, //Aggressive, response accuracy rate about 95%
|
||||
DET_MODE_2CH_90 = 2,
|
||||
DET_MODE_2CH_95 = 3,
|
||||
DET_MODE_3CH_90 = 4,
|
||||
DET_MODE_3CH_95 = 5,
|
||||
} det_mode_t;
|
||||
|
||||
typedef struct {
|
||||
@ -27,18 +31,6 @@ typedef struct {
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_wn_iface_op_create_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode);
|
||||
|
||||
/**
|
||||
* @brief Function type to initialze a model instance with a detection mode and specified wake word coefficient
|
||||
*
|
||||
* Warning: Just wakeNet6 support this function to select which core to run neural network.
|
||||
*
|
||||
* @param det_mode The wake words detection mode to trigger wake words, DET_MODE_90 or DET_MODE_95
|
||||
* @param model_coeff The specified wake word model coefficient
|
||||
* @param core Core to run neural network
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const model_coeff_getter_t *model_coeff, det_mode_t det_mode, int core);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
||||
*
|
||||
@ -50,6 +42,17 @@ typedef model_iface_data_t* (*esp_wn_iface_op_create_pinned_to_core_t)(const mod
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the channel number of samples that need to be passed to the detect function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of channels
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
@ -109,6 +112,23 @@ typedef float (*esp_wn_iface_op_get_det_threshold_t)(model_iface_data_t *model,
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Get the volume gain
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param target_db The target dB to calculate volume gain
|
||||
* @returns the volume gain
|
||||
*/
|
||||
typedef float (*esp_wn_iface_op_get_vol_gain_t)(model_iface_data_t *model, float target_db);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a speech recognition model
|
||||
*
|
||||
@ -122,13 +142,15 @@ typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
*/
|
||||
typedef struct {
|
||||
esp_wn_iface_op_create_t create;
|
||||
esp_wn_iface_op_create_pinned_to_core_t create_pinned_to_core;
|
||||
esp_wn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_wn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_wn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_wn_iface_op_get_word_num_t get_word_num;
|
||||
esp_wn_iface_op_get_word_name_t get_word_name;
|
||||
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
esp_wn_iface_op_detect_t detect;
|
||||
esp_wn_iface_op_destroy_t destroy;
|
||||
} esp_wn_iface_t;
|
||||
|
||||
@ -4,25 +4,19 @@
|
||||
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
|
||||
//a specific phrase or word.
|
||||
|
||||
extern const esp_wn_iface_t esp_sr_wakenet3_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet4_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet5_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet5_float;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet7_quantized;
|
||||
extern const esp_wn_iface_t esp_sr_wakenet7_quantized8;
|
||||
|
||||
/*
|
||||
Configure network to use based on what's selected in menuconfig.
|
||||
*/
|
||||
#if CONFIG_SR_MODEL_WN3_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet3_quantized
|
||||
#elif CONFIG_SR_MODEL_WN4_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet4_quantized
|
||||
#elif CONFIG_SR_MODEL_WN5_FLOAT
|
||||
#define WAKENET_MODEL esp_sr_wakenet5_float
|
||||
#elif CONFIG_SR_MODEL_WN5_QUANT
|
||||
#if CONFIG_SR_MODEL_WN5_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet5_quantized
|
||||
#elif CONFIG_SR_MODEL_WN6_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet6_quantized
|
||||
#elif CONFIG_SR_MODEL_WN7_QUANT
|
||||
#define WAKENET_MODEL esp_sr_wakenet7_quantized
|
||||
#elif CONFIG_SR_MODEL_WN7_QUANT8
|
||||
#define WAKENET_MODEL esp_sr_wakenet7_quantized8
|
||||
#else
|
||||
#error No valid neural network model selected.
|
||||
#endif
|
||||
@ -30,19 +24,7 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
/*
|
||||
Configure wake word to use based on what's selected in menuconfig.
|
||||
*/
|
||||
#if CONFIG_SR_WN3_HILEXIN
|
||||
#include "hilexin_wn3.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn3
|
||||
|
||||
#elif CONFIG_SR_WN4_HILEXIN
|
||||
#include "hilexin_wn4.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn4
|
||||
|
||||
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_FLOAT
|
||||
#include "hilexin_wn5_float.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn5_float
|
||||
|
||||
#elif CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
|
||||
#if CONFIG_SR_WN5_HILEXIN & CONFIG_SR_MODEL_WN5_QUANT
|
||||
#include "hilexin_wn5.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn5
|
||||
|
||||
@ -74,17 +56,37 @@ extern const esp_wn_iface_t esp_sr_wakenet6_quantized;
|
||||
#include "hijeson_wn5X3.h"
|
||||
#define WAKENET_COEFF get_coeff_hijeson_wn5X3
|
||||
|
||||
#elif CONFIG_SR_WN6_NIHAOXIAOXIN
|
||||
#include "nihaoxiaoxin_wn6.h"
|
||||
#define WAKENET_COEFF get_coeff_nihaoxiaoxin_wn6
|
||||
|
||||
#elif CONFIG_SR_WN5_CUSTOMIZED_WORD
|
||||
#include "customized_word_wn5.h"
|
||||
#define WAKENET_COEFF get_coeff_customized_word_wn5
|
||||
|
||||
#elif CONFIG_SR_WN6_CUSTOMIZED_WORD
|
||||
#include "customized_word_wn6.h"
|
||||
#define WAKENET_COEFF get_coeff_customized_word_wn6
|
||||
#elif CONFIG_SR_WN7_CUSTOMIZED_WORD
|
||||
#include "customized_word_wn7.h"
|
||||
#define WAKENET_COEFF get_coeff_customized_word_wn7
|
||||
|
||||
#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT
|
||||
#include "xiaoaitongxue_wn7.h"
|
||||
#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7
|
||||
|
||||
#elif CONFIG_SR_WN7_XIAOAITONGXUE & CONFIG_SR_MODEL_WN7_QUANT8
|
||||
#include "xiaoaitongxue_wn7_q8.h"
|
||||
#define WAKENET_COEFF get_coeff_xiaoaitongxue_wn7_q8
|
||||
|
||||
#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT
|
||||
#include "hilexin_wn7.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn7
|
||||
|
||||
#elif CONFIG_SR_WN7_HILEXIN & CONFIG_SR_MODEL_WN7_QUANT8
|
||||
#include "hilexin_wn7_q8.h"
|
||||
#define WAKENET_COEFF get_coeff_hilexin_wn7_q8
|
||||
|
||||
#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT
|
||||
#include "alexa_wn7.h"
|
||||
#define WAKENET_COEFF get_coeff_alexa_wn7
|
||||
|
||||
#elif CONFIG_SR_WN7_ALEXA & CONFIG_SR_MODEL_WN7_QUANT8
|
||||
#include "alexa_wn7_q8.h"
|
||||
#define WAKENET_COEFF get_coeff_alexa_wn7_q8
|
||||
|
||||
#else
|
||||
#error No valid wake word selected.
|
||||
|
||||
BIN
wake_word_engine/libwakeword_model.a
Normal file
BIN
wake_word_engine/libwakeword_model.a
Normal file
Binary file not shown.
Loading…
Reference in New Issue
Block a user