esp-sr/include/esp32s2/esp_afe_config.h

289 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include "esp_aec.h"
#include "esp_agc.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
/**
 * @brief Legacy selector for the AFE_SR mode.
 *
 * @deprecated Both enumerators are deprecated; please use afe_mode_t instead.
 */
typedef enum {
    SR_MODE_LOW_COST  = 0, // Deprecated (use afe_mode_t): low cost mode
    SR_MODE_HIGH_PERF = 1, // Deprecated (use afe_mode_t): high performance mode
} afe_sr_mode_t;
/**
 * @brief AFE mode selector.
 */
typedef enum {
    AFE_MODE_LOW_COST  = 0, // Low cost mode
    AFE_MODE_HIGH_PERF = 1, // High performance mode
} afe_mode_t;
/**
 * @brief AFE type selector (which scenario the front-end is configured for).
 */
typedef enum {
    // Speech recognition scenarios, excluding nonlinear noise suppression
    AFE_TYPE_SR = 0,
    // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
    AFE_TYPE_VC = 1,
    // Voice communication scenarios, 8KHz input; note that the input data must be 8KHz
    AFE_TYPE_VC_8K = 2,
} afe_type_t;
/**
 * @brief Memory allocation preference for AFE (internal RAM versus PSRAM).
 */
typedef enum {
    AFE_MEMORY_ALLOC_MORE_INTERNAL          = 1, // Allocate with more internal RAM
    AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // Balance allocation between internal RAM and PSRAM
    AFE_MEMORY_ALLOC_MORE_PSRAM             = 3  // Allocate with more PSRAM
} afe_memory_alloc_mode_t;
/**
 * @brief Peak amplitude of the fetched audio.
 *
 * Each enumerator's numeric value is the peak level in dB.
 */
typedef enum {
    AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
    AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
    AFE_MN_PEAK_NO_AGC     = 0,  // There is no agc gain
} afe_mn_peak_agc_mode_t;
/**
 * @brief Channel layout and sample rate of the PCM data.
 */
typedef struct {
    // Total channel number: microphone, playback and unknown channels together
    int total_ch_num;
    // Microphone channel number
    int mic_num;
    // Microphone channel indices
    uint8_t *mic_ids;
    // Playback reference channel number
    int ref_num;
    // Playback reference channel indices
    uint8_t *ref_ids;
    // Sample rate of audio
    int sample_rate;
} afe_pcm_config_t;
/**
 * @brief Noise suppression (NS) mode.
 */
typedef enum {
    AFE_NS_MODE_WEBRTC = 0, // WebRTC NS; please use the NS model name "WEBRTC"
    AFE_NS_MODE_NET    = 1, // NSNET; please use the model name of NSNET
} afe_ns_mode_t;
/**
 * @brief Automatic gain control (AGC) mode.
 */
typedef enum {
    // WEBRTC AGC
    AFE_AGC_MODE_WEBRTC = 0,
    // AGC gain is calculated by the wakenet model if wakenet is activated
    AFE_AGC_MODE_WAKENET = 1,
} afe_agc_mode_t;
/**
 * @brief Callback type used to receive the debug audio data.
 *
 * @param data      The debug audio data. It must not be modified, and it should be copied
 *                  away as soon as possible to avoid blocking for too long.
 * @param data_size The number of bytes of data.
 */
typedef void (*afe_debug_hook_callback_t)(const int16_t *data,
                                          int data_size);
/**
 * @brief Identifies which debug hook point a callback is attached to.
 */
typedef enum {
    AFE_DEBUG_HOOK_MASE_TASK_IN  = 0, // To get the input data of mase task
    AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
    AFE_DEBUG_HOOK_MAX           = 2  // One past the last hook type listed above
} afe_debug_hook_type_t;
/**
 * @brief Pairs a debug hook type with the callback that receives its audio data.
 */
typedef struct {
    // Debug type of hook
    afe_debug_hook_type_t hook_type;
    // Callback function which transfers the debug audio data
    afe_debug_hook_callback_t hook_callback;
} afe_debug_hook_t;
/**
 * @brief Complete AFE (Audio Front-End) configuration.
 *
 * Members are grouped by algorithm. The member order is part of the binary layout and
 * must not be changed.
 */
typedef struct {
    /********** AEC(Acoustic Echo Cancellation) **********/
    // Whether to init aec
    bool aec_init;
    // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
    aec_mode_t aec_mode;
    // The filter length of aec
    int aec_filter_length;

    /********** SE(Speech Enhancement, microphone array processing) **********/
    // Whether to init se
    bool se_init;

    /********** NS(Noise Suppression) **********/
    // Whether to init ns
    bool ns_init;
    // Model name of ns
    char *ns_model_name;
    // Model mode of ns
    afe_ns_mode_t afe_ns_mode;

    /********** VAD(Voice Activity Detection) **********/
    // Whether to init vad
    bool vad_init;
    // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
    vad_mode_t vad_mode;
    // The model name of vad. If it is null, WebRTC VAD will be used.
    char *vad_model_name;
    // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
    int vad_min_speech_ms;
    // The minimum duration of noise or silence in ms. It should be bigger than 64 ms,
    // default: 1000 ms
    int vad_min_noise_ms;
    // The delay of the first speech frame in ms, default: 128 ms.
    // If you find the vad cache can not cover all speech, please increase this value.
    int vad_delay_ms;
    // If true, the playback will be muted for vad detection. default: false
    bool vad_mute_playback;
    // If true, the vad will be used to choose the channel id. default: false
    bool vad_enable_channel_trigger;

    /********** WakeNet(Wake Word Engine) **********/
    // NOTE(review): undocumented in the original; presumably "whether to init wakenet",
    // like the sibling *_init flags - confirm.
    bool wakenet_init;
    // The model name of wakenet 1
    char *wakenet_model_name;
    // The model name of wakenet 2, if there is a wakenet 2
    char *wakenet_model_name_2;
    // The mode of wakenet
    det_mode_t wakenet_mode;

    /********** AGC(Automatic Gain Control) **********/
    // Whether to init agc
    bool agc_init;
    // The AGC mode for ASR. The gain generated by AGC acts on the audio after the linear
    // gain (NOTE(review): the original says "far linear gain"; presumably afe_linear_gain).
    afe_agc_mode_t agc_mode;
    // Compression gain in dB (default 9)
    int agc_compression_gain_db;
    // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
    int agc_target_level_dbfs;

    /********** General AFE(Audio Front End) parameter **********/
    // Config the channel num of original data which is fed to the afe feed function.
    afe_pcm_config_t pcm_config;
    // The mode of afe: AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
    afe_mode_t afe_mode;
    // The type of afe: AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
    afe_type_t afe_type;
    // The preferred core of afe se task, which is created in afe_create function.
    int afe_perferred_core;
    // The preferred priority of afe se task, which is created in afe_create function.
    int afe_perferred_priority;
    // The ring buffer size: the number of frame data in ring buffer.
    int afe_ringbuf_size;
    // The memory alloc mode for afe. From Internal RAM or PSRAM
    afe_memory_alloc_mode_t memory_alloc_mode;
    // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
    // directly on the output amplitude: out_linear_gain * amplitude.
    float afe_linear_gain;
    // NOTE(review): undocumented in the original; purpose not visible in this header.
    bool debug_init;
    // If true, the channel after first wake-up is fixed to raw data of microphone;
    // otherwise, select channel number by wakenet
    bool fixed_first_channel;
} afe_config_t;
/**
 * @brief Get the AFE default configuration.
 *
 * The default configuration enables all algorithms as much as possible, based on the chip
 * target and the input format. It can be fine-tuned manually after it has been created.
 *
 * Characters of the input format:
 *   - M represents a microphone channel
 *   - R represents the playback reference channel
 *   - N represents an unknown or unused channel
 *
 * For example, input_format="MMNR" indicates that the input data consists of four channels:
 * microphone, microphone, unused, and playback reference, in that order.
 *
 * @param input_format The input format
 * @param models       Models from partition, which is configured by Kconfig
 * @param type         The type of afe, AFE_TYPE_SR or AFE_TYPE_VC (see afe_type_t)
 * @param mode         The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
 * @return afe_config_t* The default config of afe
 */
afe_config_t *afe_config_init(const char *input_format,
                              srmodel_list_t *models,
                              afe_type_t type,
                              afe_mode_t mode);
/**
 * @brief Check the AFE configuration and make sure it is correct.
 *
 * @warning If there is a configuration conflict, this function will modify some parameters.
 *          The guiding principle behind these modifications is to maintain the highest
 *          performance of the output audio and results, and to remove the conflicts between
 *          different algorithms.
 *
 *          For example, if the input is two-channel data, the SE(BSS) algorithm will be
 *          prioritized over the NS algorithm. If the SE(BSS) algorithm is deactivated, only
 *          the first microphone channel will be used.
 *
 * @param afe_config Input AFE config
 *
 * @return afe_config_t* The modified AFE config
 */
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
 * @brief Parse an input format string into a pcm config.
 *
 * @param input_format The input format, same as for the afe_config_init() function
 * @param pcm_config   The pcm config
 *
 * @return true if the input format is parsed successfully, otherwise false
 */
bool afe_parse_input_format(const char *input_format,
                            afe_pcm_config_t *pcm_config);
/**
 * @brief Parse I2S input data into microphone data and playback reference data.
 *
 * @param data       The input multi channel data
 * @param frame_size The frame size of input; it is also the size of single channel data
 * @param mic_data   The output microphone data
 * @param ref_data   The output playback reference data
 * @param pcm_config The pcm config
 */
void afe_parse_input(int16_t *data,
                     int frame_size,
                     int16_t *mic_data,
                     int16_t *ref_data,
                     afe_pcm_config_t *pcm_config);
/**
 * @brief Parse input data: convert it from interleaved arrangement to contiguous arrangement.
 *
 * @param data        The input multi channel data
 * @param frame_size  The frame size of input; it is also the size of single channel data
 * @param channel_num The channel number of data
 * @param out_data    The output data
 */
void afe_parse_data(int16_t *data,
                    int frame_size,
                    int channel_num,
                    int16_t *out_data);
/**
 * @brief Format input data: convert it from contiguous arrangement to interleaved arrangement.
 *
 * @param data        The input multi channel data
 * @param frame_size  The frame size of input; it is also the size of single channel data
 * @param channel_num The channel number of data
 * @param out_data    The output data
 */
void afe_format_data(int16_t *data,
                     int frame_size,
                     int channel_num,
                     int16_t *out_data);
/**
 * @brief Adjust the gain of the input data.
 *
 * @warning The input data will be modified in place.
 *
 * @param data       The input audio data
 * @param frame_size The frame size of input; it is also the size of single channel data
 * @param factor     The gain factor
 *
 * @return int16_t* The output audio data
 */
int16_t *afe_adjust_gain(int16_t *data,
                         int frame_size,
                         float factor);
/**
 * @brief Concatenate audio data from an input buffer into an output buffer.
 *
 * NOTE(review): the original brief and warning here were verbatim copies of the ones on
 * afe_adjust_gain() ("Adjust the gain of input data" / "the input data will be modified
 * inplace"). The summary above is inferred from the function name only; confirm the exact
 * semantics, and whether in_data is really modified, against the implementation.
 *
 * @param in_data        The input audio data
 * @param in_frame_size  Frame size of the input data
 * @param channel_num    The channel number of input data, which is the same as output data
 * @param out_data       The output audio data
 * @param out_frame_size Frame size of the output data
 */
void afe_concat_data(int16_t *in_data,
                     int in_frame_size,
                     int channel_num,
                     int16_t *out_data,
                     int out_frame_size);
/**
 * @brief Copy an afe config from src_config into dst_config.
 *
 * @param dst_config The destination afe config
 * @param src_config The source afe config (not modified: it is const-qualified)
 *
 * @return The destination afe config
 */
afe_config_t *afe_config_copy(afe_config_t *dst_config,
                              const afe_config_t *src_config);
/**
 * @brief Print the afe config.
 *
 * @param afe_config The afe config to print (not modified: it is const-qualified)
 */
void afe_config_print(const afe_config_t *afe_config);
/**
 * @brief Allocate an afe config.
 *
 * Declared with an explicit (void) parameter list: in C, an empty () declares a function
 * with unspecified parameters rather than a prototype, so calls that pass arguments by
 * mistake would not be diagnosed. In C++ both spellings mean the same thing.
 *
 * NOTE(review): presumably the result is released with afe_config_free(); the behavior on
 * allocation failure is not visible in this header - confirm.
 *
 * @return The afe config pointer
 */
afe_config_t *afe_config_alloc(void);
/**
 * @brief Free an afe config.
 *
 * @param afe_config The afe config pointer to free
 */
void afe_config_free(afe_config_t *afe_config);
#ifdef __cplusplus
}
#endif