Merge branch 'feat/add_mfcc_interface' into 'master'

feat: add mfcc interface

See merge request speech-recognition-framework/esp-sr!134
This commit is contained in:
Sun Xiang Yu 2025-02-07 11:37:41 +08:00
commit 2993ce18ce
41 changed files with 1234 additions and 498 deletions

View File

@ -0,0 +1,29 @@
#pragma once
// Precision configuration for c_speech_features: the csf_* aliases below map
// onto either the double- or single-precision <math.h> functions depending on
// whether ENABLE_DOUBLE is defined at configure time.
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
// Double-precision build: csf_float is double and the csf_* helpers resolve
// to the double-precision <math.h> functions.
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
// Single-precision build (default): csf_float is float and the csf_* helpers
// resolve to the float variants of the <math.h> functions.
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that
* avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
* @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
// start from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
* @brief Function to initialize an AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* when wakenet has been initialized. It's only support wakenet 1 now.
*
* @param afe The AFE_SR object to query
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
// Opaque runner state; the concrete layout is private to each implementation.
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc
// (from c_speech_features), please refer to its documentation for details.
typedef struct {
    int winstep_ms;        // The step between successive windows in ms. (10)
    int winlen_ms;         // The length of the analysis window in ms. (25)
    int nch;               // The number of input channels
    int numcep;            // The number of cepstra to return
    int nfilter;           // The number of filters in the filterbank
    int nfft;              // The FFT size
    int samp_freq;         // The sample-rate of the signal.
    int low_freq;          // The lowest band edge of mel filters, in hz. (e.g. 0)
    int high_freq;         // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
    float preemph;         // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
    char *win_type;        // Analysis window type to apply to each frame: "hanning","hamming","sine","rectangular","povey"
    bool append_energy;    // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
    bool use_power;        // If true, use power of fft spectrum, else use magnitude of fft spectrum
    int use_log_fbank;     // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
    float log_epsilon;     // log epsilon. (e.g. 1e-7)
    bool psram_first;      // Alloc memory from PSRAM first
    bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
 * @brief Un-initialize and free a mfcc runner
 *
 * Function to free a previously allocated mfcc runner.
 *
 * @param r Runner object to destroy
 */
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
 * @brief Create and initialize a mfcc runner.
 *
 * After creation, a mfcc runner needs to be initialized first; this is usually done
 * in the initialization routine of a speech recognition algorithm. This provides
 * a pointer to do this for a specific mfcc runner.
 *
 * @param opt Options for the mfcc process
 * @return Pointer to the new runner object, or NULL on error.
 */
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
 * @brief Run a mfcc iteration on frame by frame
 *
 * This will take a set of samples and return a cepstrum. Note that this may be pipelined:
 * an initial call to this function may return NULL and subsequent calls may return the
 * cepstrum of previous calls.
 *
 * @param r The mfcc runner
 * @param samp An array of signed 16-bit samples. The amount of samples should be sampfreq/(winstep_ms/1000).
 * @param nch The number of channels in samp
 * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
 * when done with this buffer. Note that some implementations require the buffer to be freed before another call
 * to this function is done.
 * NOTE(review): no free_cepbuf op is declared in esp_mfcc_iface_t below — confirm
 * who owns/frees the returned buffer.
 */
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
 * @brief Clean all state of mfcc handle
 *
 * @param r The mfcc runner
 */
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
 * @brief Operations possible on a mfcc runner
 */
typedef struct {
    esp_mfcc_op_destroy_t destroy;   // free the runner
    esp_mfcc_op_create_t create;     // allocate and configure a runner
    esp_mfcc_op_run_step_t run_step; // feed one frame, fetch cepstra
    esp_mfcc_op_clean_t clean;       // reset internal state
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"

// C linkage guard, matching the convention of the sibling ESP-SR headers
// (e.g. esp_afe_config.h), so these symbols link correctly from C++.
#ifdef __cplusplus
extern "C" {
#endif

extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle

/**
 * @brief Return basic opts used in wakenet9 & multinet5
 *
 * @return Pointer to the option set.
 *         NOTE(review): ownership is not documented — confirm whether the
 *         caller must free the returned opts.
 **/
esp_mfcc_opts_t *get_mfcc_opts_wn9(void);

/**
 * @brief Return basic opts for default kaldifeat
 *
 opts->psram_first = true;
 opts->use_power = true;
 opts->use_log_fbank = 2; // log(max(x, log_epsilon))
 opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
 opts->win_type = "povey";
 opts->low_freq = 20;
 opts->high_freq = 7600;
 opts->samp_freq = 16000;
 opts->nch = 1;
 opts->nfft = 512;
 opts->nfilter = 80;
 opts->numcep = 80;
 opts->preemph = 0.97;
 opts->append_energy = false;
 opts->winlen_ms = 25;
 opts->winstep_ms = 10;
 opts->remove_dc_offset = true;
 *
 * @return Pointer to the option set (see defaults listed above).
 **/
esp_mfcc_opts_t *get_mfcc_opts_kaldi(void);

/**
 * @brief Print mfcc opts
 *
 * @param opts The mfcc options to print
 **/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005
#endif
typedef struct
{
float *coeff;
int *bank_pos;
int nfilter;
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
* @brief Initialize window function
* Currently supports hanning, hamming, sine, povey, rectangular,
* wn9 (512-hanning, for compatibility with wakenet9 & multinet5)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);

View File

@ -0,0 +1,29 @@
#pragma once
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to avoid blocking for too long.
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
* avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
* @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model is woken up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model is woken up. Index
// starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
* @brief Function to initialze a AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initialize wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* @brief Initialize wakenet and wake words coefficient, or reset wakenet and wake words coefficient
* when wakenet has been initialized. It only supports wakenet 1 now.
*
* @param afe The AFE_SR object to query
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
* Function to free a previously allocated mfcc runner.
*
* @param r Runner object to destroy
*/
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
* @brief Initialize parameters for a mfcc runner.
*
* After creation, a mfcc runner needs to be initialized first; this is usually done
* in the initialization routine of a speech recognition algorithm. This provides
* a pointer to do this for a specific mfcc runner.
*
* @param opt Options for the mfcc process
* @return True if success, false on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
* @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle
*
* @param r The mfcc runner
*/
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
* @brief Operations possible on a mfcc runner
*/
typedef struct {
esp_mfcc_op_destroy_t destroy;
esp_mfcc_op_create_t create;
esp_mfcc_op_run_step_t run_step;
esp_mfcc_op_clean_t clean;
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
/**
* @brief Return basic opts used in wakenet9 & multinet5
**/
esp_mfcc_opts_t *get_mfcc_opts_wn9();
/**
* @brief Return basic opts for default kaldifeat
*
opts->psram_first = true;
opts->use_power = true;
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
opts->win_type = "povey";
opts->low_freq = 20;
opts->high_freq = 7600;
opts->samp_freq = 16000;
opts->nch = 1;
opts->nfft = 512;
opts->nfilter = 80;
opts->numcep = 80;
opts->preemph = 0.97;
opts->append_energy = false;
opts->winlen_ms = 25;
opts->winstep_ms = 10;
opts->remove_dc_offset = true;
*
**/
esp_mfcc_opts_t *get_mfcc_opts_kaldi();
/**
* @brief Print mfcc opts
**/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005
#endif
// Mel filterbank data used by esp_mel_dotprod_step (see esp_mel_filter_init).
typedef struct
{
    float *coeff;   // filter weight coefficients; exact layout defined by esp_mel_filter_init
    int *bank_pos;  // per-filter FFT-bin positions — presumably the start bin of each filter; TODO confirm
    int nfilter;    // number of mel filters in the bank
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
 * @brief Initialize the window function
* Currently support hanning, hamming, sine, povey, rectangular,
* wn9(512-hanning to get wakenet9& multinet5 compatible)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);

View File

@ -0,0 +1,29 @@
#pragma once
#include <float.h>
#include <math.h>
/* #undef ENABLE_DOUBLE */
#ifdef ENABLE_DOUBLE
# define csf_float double
# define csf_ceil ceil
# define csf_floor floor
# define csf_sin sin
# define csf_log log
# define csf_log10 log10
# define csf_pow pow
# define csf_sqrt sqrt
# define csf_abs fabs
# define csf_float_min DBL_MIN
#else
# define csf_float float
# define csf_ceil ceilf
# define csf_floor floorf
# define csf_sin sinf
# define csf_log logf
# define csf_log10 log10f
# define csf_pow powf
# define csf_sqrt sqrtf
# define csf_abs fabsf
# define csf_float_min FLT_MIN
#endif

View File

@ -1,241 +1,245 @@
#pragma once
#include "stdint.h"
#include "stdbool.h"
#include "stdlib.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "esp_vad.h"
#include "esp_aec.h"
#include "esp_agc.h"
#include "model_path.h"
#include "esp_vadn_models.h"
#include "esp_nsn_models.h"
#include "esp_vad.h"
#include "esp_vadn_models.h"
#include "esp_wn_iface.h"
#include "esp_wn_models.h"
#include "model_path.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//VC: Voice Communication
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
//Set AFE_SR mode
// Set AFE_SR mode
typedef enum {
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
//Set AFE mode
// Set AFE mode
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
//Set AFE type
// Set AFE type
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
    AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t* mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t* ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
/**
* @brief Function to get the debug audio data
*
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
 * @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
 * avoid blocking for too long.
* @param data_size The number of bytes of data.
* @returns
*/
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
typedef enum {
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
AFE_DEBUG_HOOK_MAX = 2
} afe_debug_hook_type_t;
typedef struct {
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
afe_debug_hook_type_t hook_type; // debug type of hook
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
} afe_debug_hook_t;
typedef struct {
/********** AEC(Acoustic Echo Cancellation) **********/
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
bool aec_init; // Whether to init aec
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
int aec_filter_length; // The filter length of aec
/********** SE(Speech Enhancement, microphone array processing) **********/
bool se_init; // Whether to init se
bool se_init; // Whether to init se
/********** NS(Noise Suppression) **********/
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
bool ns_init; // Whether to init ns
char *ns_model_name; // Model name of ns
afe_ns_mode_t afe_ns_mode; // Model mode of ns
/********** VAD(Voice Activity Detection) **********/
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
bool vad_init; // Whether to init vad
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
/********** WakeNet(Wake Word Engine) **********/
bool wakenet_init;
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
char *wakenet_model_name; // The model name of wakenet 1
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
det_mode_t wakenet_mode; // The mode of wakenet
/********** AGC(Automatic Gain Control) **********/
bool agc_init; // Whether to init agc
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
bool agc_init; // Whether to init agc
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
afe_type_t afe_type; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
afe_mode_t afe_mode; // The mode of afe AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
    afe_type_t afe_type;       // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
// directly on the output amplitude: out_linear_gain * amplitude.
bool debug_init;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
} afe_config_t;
/**
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
* You can manually fine-tune it after creating the configuration
*
* The input format:
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
* on the chip target and input format. You can manually fine-tune it after creating the configuration
*
* The input format:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
*
* @param input_format The input format
* @param models Models from partition, which is configured by Kconfig
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
*
* @return afe_config_t* The default config of afe
*/
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
/**
* @brief Check AFE configuration and make sure it is correct.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
*
* @warning If there is a configuration conflict, this function will modify some parameters.
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
* And remove the conflict between different algorithms.
*
*
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
*
*
* @param afe_config Input AFE config
*
*
* @return afe_config_t* The modified AFE config
*/
afe_config_t *afe_config_check(afe_config_t *afe_config);
/**
* @brief Parse input format
*
*
* @param input_format The input format, same with afe_config_init() function
* @param pcm_config The pcm config
*
*
* @return true if the input format is parsed successfully, otherwise false
*/
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
/**
* @brief Parse I2S input data
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param mic_data The output microphone data
* @param ref_data The output playback reference data
* @param pcm_config The pcm config
*
*
*/
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
/**
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Format input data, from contiguous arrangement to interleaved arrangement
*
*
* @param data The input multi channel data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param channel_num The channel number of data
* @param out_data The output data
*
*
*/
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param data The input audio data
* @param frame_size The frame size of input, it is also the size of single channel data
* @param factor The gain factor
*
*
* @return int16_t* The output audio data
*/
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
/**
* @brief Adjust the gain of input data
*
*
* @warning the input data will be modified inplace.
*
*
* @param in_data The input audio data
* @param in_frame_size Input data frame size of input
* @param channel_num The channel number of input data, which is same as output data
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
 * @param out_frame_size Output data frame size
*
*/
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
/**
* @brief Copy the afe config
*
*
* @param dst_config The destination afe config
* @param src_config The source afe config
*
*
* @return The destination afe config
*/
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
/**
* @brief Print the afe config
*
*
* @param afe_config The afe config
*/
void afe_config_print(const afe_config_t *afe_config);
/**
* @brief Allocate afe config
*
*
* @return The afe config pointer
*/
afe_config_t *afe_config_alloc();
/**
* @brief Free afe config
*
*
* @param afe_config The afe config pointer
*/
void afe_config_free(afe_config_t *afe_config);

View File

@ -1,62 +1,61 @@
#pragma once
#include "esp_afe_config.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#include "stdbool.h"
#include "esp_afe_config.h"
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#ifdef __cplusplus
extern "C" {
#endif
//AFE: Audio Front-End
//SR: Speech Recognition
//afe_sr/AFE_SR: the audio front-end for speech recognition
// AFE: Audio Front-End
// SR: Speech Recognition
// afe_sr/AFE_SR: the audio front-end for speech recognition
//Opaque AFE_SR data container
// Opaque AFE_SR data container
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
/**
* @brief The state of vad
*/
typedef enum
{
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
typedef enum {
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
} afe_vad_state_t;
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void* reserved; // reserved for future use
typedef struct afe_fetch_result_t {
int16_t *data; // the target channel data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
// audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
// wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
    int wakenet_model_index; // if there are multiple wakenets, this value identifies which model triggered the
                             // wake-up. Index starts from 1.
vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
int16_t *raw_data; // the multi-channel output data of audio.
int raw_data_channels; // the channel number of raw data
void *reserved; // reserved for future use
} afe_fetch_result_t;
/**
 * @brief Function to initialize an AFE_SR instance
*
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
/**
* @brief Get the channel number
*
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
*
* @param afe The AFE_SR object to query
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
* audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
/**
* @brief reset ringbuf of AFE.
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
/**
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
 * @brief Initialize the wakenet and wake word coefficients, or reset them
 *        when wakenet has already been initialized. Only wakenet 1 is supported now.
*
* @param afe The AFE_SR object to query
 * @param model_name The wakenet model name, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
*/
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
/**
* This structure contains the functions used to do operations on a AFE_SR.
*/
@ -191,11 +191,11 @@ typedef struct {
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
esp_afe_sr_iface_op_disable_func_t disable_aec;
@ -212,16 +212,14 @@ typedef struct {
esp_afe_sr_iface_op_destroy_t destroy;
} esp_afe_sr_iface_t;
// struct is used to store the AFE handle and data for the AFE task
typedef struct
{
typedef struct {
esp_afe_sr_data_t *afe_data;
esp_afe_sr_iface_t *afe_handle;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
}afe_task_into_t;
TaskHandle_t feed_task;
TaskHandle_t fetch_task;
} afe_task_into_t;
#ifdef __cplusplus
}
#endif
#endif

View File

@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
#ifdef __cplusplus
}
#endif
#endif

View File

@ -0,0 +1,89 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
* Function to free a previously allocated mfcc runner.
*
* @param r Runner object to destroy
*/
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
/**
* @brief Initialize parameters for a mfcc runner.
*
* After creation, a mfcc runner needs to be initialized first; this is usually done
* in the initialization routine of a speech recognition algorithm. This provides
* a pointer to do this for a specific mfcc runner.
*
* @param opt Options for the mfcc process
 * @return Pointer to the created mfcc runner, or NULL on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a ceptrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
 * @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle
*
* @param r The mfcc runner
*/
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
/**
* @brief Operations possible on a mfcc runner
*/
typedef struct {
    esp_mfcc_op_destroy_t destroy;    // free a previously created mfcc runner
    esp_mfcc_op_create_t create;      // create a runner from an esp_mfcc_opts_t configuration
    esp_mfcc_op_run_step_t run_step;  // feed one chunk of samples, return cepstral values (may be pipelined)
    esp_mfcc_op_clean_t clean;        // reset all internal state of the runner without destroying it
} esp_mfcc_iface_t;

View File

@ -0,0 +1,40 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32 fbank implementation of esp_mfcc_iface_t

/**
 * @brief Return basic opts used in wakenet9 & multinet5
 *
 * The returned struct is heap-allocated; confirm whether create() takes
 * ownership or the caller must free it.
 **/
esp_mfcc_opts_t *get_mfcc_opts_wn9();

/**
 * @brief Return basic opts for default kaldifeat
 *
 * The returned options are preset as follows:
 *
 *   opts->psram_first = true;
 *   opts->use_power = true;
 *   opts->use_log_fbank = 2;                     // log(max(x, log_epsilon))
 *   opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
 *   opts->win_type = "povey";
 *   opts->low_freq = 20;
 *   opts->high_freq = 7600;
 *   opts->samp_freq = 16000;
 *   opts->nch = 1;
 *   opts->nfft = 512;
 *   opts->nfilter = 80;
 *   opts->numcep = 80;
 *   opts->preemph = 0.97;
 *   opts->append_energy = false;
 *   opts->winlen_ms = 25;
 *   opts->winstep_ms = 10;
 *   opts->remove_dc_offset = true;
 **/
esp_mfcc_opts_t *get_mfcc_opts_kaldi();

/**
 * @brief Print MFCC opts (for debugging/inspection)
 **/
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -0,0 +1,64 @@
#pragma once
#include "c_speech_features_config.h"
#include "stdlib.h"
#include <assert.h>
#include <stdbool.h>
#ifndef M_2PI
#define M_2PI 6.283185307179586476925286766559005 // 2*pi, used by window-function generation
#endif

// Precomputed mel filterbank stored in a sparse per-filter form.
typedef struct
{
    float *coeff;  // Filter coefficients — presumably concatenated per filter; confirm layout
    int *bank_pos; // Per-filter start index into the FFT bins — TODO confirm
    int nfilter;   // Number of mel filters
} esp_mel_filter_t;

// Allocate a float buffer; prefers PSRAM when from_psram is true.
float* esp_mfcc_malloc(size_t size, bool from_psram);
// Free a buffer obtained from esp_mfcc_malloc().
void esp_mfcc_free(void *ptr);

/**
 * @brief Initialize FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
 *          For Other platform, use kiss fft
 *
 * @param nfft The input samples number
 * @return fft-table (opaque handle, pass to esp_fftr/esp_fft_deinit)
 **/
void* esp_fft_init(int nfft);

/**
 * @brief Free FFT table
 * @warning For ESP-PLATFORM, use esp-dsp fft
 *          For Other platform, use kiss fft
 *
 * @param fft_table The fft table initialized by esp_fft_init
 * @param nfft The input samples number
 **/
void esp_fft_deinit(void *fft_table, int nfft);

/**
 * @brief Initialize window function coefficients into window_data
 *        Currently support hanning, hamming, sine, povey, rectangular,
 *        wn9 (512-hanning, for wakenet9 & multinet5 compatibility)
 **/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);

// Real-input FFT of x (length nfft) using a table from esp_fft_init().
float* esp_fftr(float* x, int nfft, void *fft_table);
// One spectrum frame: FFT then magnitude (use_power=false) or power (use_power=true) — confirm convention.
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
// Convert int16 samples to float; optionally subtract the frame mean (remove_dc).
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
// Pre-emphasis filter: y[i] = x[i] - coeff * x[i-1], with `last` as x[-1] carried between frames.
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
// Build a mel filterbank covering [low_freq, high_freq] Hz for the given FFT size/sample rate.
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
                                      bool from_psram);
// Free a filterbank created by esp_mel_filter_init().
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
// Apply the mel filterbank to spectrum x; use_log_fbank selects raw/log output (see esp_mfcc_opts_t).
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
                            float epsilon);

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -4,6 +4,7 @@ set(srcs
"test_wakenet.cpp"
"test_multinet.cpp"
"test_afe.cpp"
"test_mfcc.cpp"
)
idf_component_register(SRCS ${srcs}

View File

@ -0,0 +1,63 @@
#include <stdio.h>
#include <stdlib.h>
#include "string.h"
#include <limits.h>
#include "unity.h"
#include "esp_log.h"
#include "esp_heap_caps.h"
#include "esp_mfcc_iface.h"
#include "esp_mfcc_models.h"
#include "alexa.h"
/**
 * @brief Build kaldi-style fbank options with a caller-chosen output dimension.
 *
 * Matches get_mfcc_opts_kaldi() defaults except use_log_fbank=0 (raw fbank,
 * no log applied) and a hanning window.
 *
 * @param dim Number of output coefficients (assigned to opts->numcep)
 * @return Newly heap-allocated options struct, or NULL on allocation failure.
 *         Ownership: presumably consumed by fbank create() — confirm; otherwise
 *         the caller must free it.
 */
esp_mfcc_opts_t *get_fbank_opts_kaldi(int dim)
{
    esp_mfcc_opts_t *opts = (esp_mfcc_opts_t*)malloc(sizeof(esp_mfcc_opts_t));
    if (opts == NULL) {
        // Avoid dereferencing NULL on OOM; caller must check the result.
        return NULL;
    }
    opts->psram_first = true;
    opts->use_power = true;
    opts->use_log_fbank = 0;                     // 0: return raw fbank (no log) — was mislabeled as log(max(x, eps))
    opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
    opts->win_type = const_cast<char*>("hanning"); // remove [-Wwrite-strings] warning
    opts->low_freq = 20;
    opts->high_freq = 7600;
    opts->samp_freq = 16000;
    opts->nch = 1;
    opts->nfft = 512;
    opts->nfilter = 80;
    opts->numcep = dim;
    opts->preemph = 0.97;
    opts->append_energy = false;
    opts->winlen_ms = 25;
    opts->winstep_ms = 10;
    opts->remove_dc_offset = true;
    return opts;
}
/**
 * @brief Fbank smoke + leak test: run one frame twice through create/run/destroy
 *        and verify heap usage is stable across runs.
 */
TEST_CASE("test speech features", "[fbank]")
{
    int16_t *buffer = (int16_t *) malloc(512 * sizeof(int16_t));
    TEST_ASSERT_NOT_NULL(buffer); // fail fast instead of memcpy into NULL
    const esp_mfcc_iface_t *fbank_handle = &esp_fbank_f32;
    float* fbank_out = NULL;
    // Baseline taken after the sample buffer is allocated, so only the
    // create/run/destroy cycle is measured below.
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // First cycle. NOTE(review): nch=0 is passed — presumably the backend's
    // mono/default convention; confirm against esp_fbank_f32.
    int out_dim = 80;
    esp_mfcc_data_t *fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim));
    memcpy(buffer, alexa, 512 * sizeof(int16_t));
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);
    // Print BEFORE destroying the runner: the output buffer is runner-managed
    // (see esp_mfcc_op_run_step_t docs), so reading it after destroy() is a
    // use-after-free. The original test printed after destroy.
    for (int i = 0; i < out_dim; i++) {
        printf("%f ", fbank_out[i]);
    }
    fbank_handle->destroy(fbank_model);
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // Second cycle: identical work; free-heap must come back to the same level.
    fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim));
    memcpy(buffer, alexa, 512 * sizeof(int16_t));
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);
    fbank_handle->destroy(fbank_model);
    int second_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    free(buffer); // was leaked in the original test

    TEST_ASSERT_EQUAL(true, start_size - first_end_size < 100); // allow small system/allocator residue
    TEST_ASSERT_EQUAL(true, first_end_size == second_end_size); // no per-cycle leak
}