mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Merge branch 'feat/add_mfcc_interface' into 'master'
feat: add mfcc interface See merge request speech-recognition-framework/esp-sr!134
This commit is contained in:
commit
2993ce18ce
29
include/esp32/c_speech_features_config.h
Normal file
29
include/esp32/c_speech_features_config.h
Normal file
@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
/* #undef ENABLE_DOUBLE */
|
||||
|
||||
/*
 * Floating-point precision selection for the speech-feature code.
 *
 * When ENABLE_DOUBLE is defined, csf_float is double and every csf_* math
 * alias maps to the double-precision <math.h> function; otherwise csf_float
 * is float and the aliases map to the f-suffixed single-precision variants.
 * csf_float_min is the matching smallest normalized positive value from
 * <float.h> (DBL_MIN or FLT_MIN).
 */
#ifdef ENABLE_DOUBLE
|
||||
# define csf_float double
|
||||
# define csf_ceil ceil
|
||||
# define csf_floor floor
|
||||
# define csf_sin sin
|
||||
# define csf_log log
|
||||
# define csf_log10 log10
|
||||
# define csf_pow pow
|
||||
# define csf_sqrt sqrt
|
||||
# define csf_abs fabs
|
||||
# define csf_float_min DBL_MIN
|
||||
#else
|
||||
# define csf_float float
|
||||
# define csf_ceil ceilf
|
||||
# define csf_floor floorf
|
||||
# define csf_sin sinf
|
||||
# define csf_log logf
|
||||
# define csf_log10 log10f
|
||||
# define csf_pow powf
|
||||
# define csf_sqrt sqrtf
|
||||
# define csf_abs fabsf
|
||||
# define csf_float_min FLT_MIN
|
||||
#endif
|
||||
@ -1,241 +1,245 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdlib.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "model_path.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//VC: Voice Communication
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// VC: Voice Communication
|
||||
|
||||
//Set AFE_SR mode
|
||||
// Set AFE_SR mode
|
||||
typedef enum {
|
||||
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
} afe_sr_mode_t;
|
||||
|
||||
//Set AFE mode
|
||||
// Set AFE mode
|
||||
typedef enum {
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
} afe_mode_t;
|
||||
|
||||
//Set AFE type
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
} afe_memory_alloc_mode_t;
|
||||
|
||||
// Peak amplitude of the fetched audio; each enumerator's value is the peak level in dB (0 means no AGC gain).
typedef enum {
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
} afe_mn_peak_agc_mode_t;
|
||||
|
||||
typedef struct {
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t* mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t* ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t *mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t *ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
} afe_pcm_config_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
} afe_ns_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to avoid blocking for too long.
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
// AFE configuration: per-algorithm switches and parameters (AEC, SE, NS, VAD, WakeNet, AGC) plus general AFE settings.
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad. If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad. If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
|
||||
* You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
* @param out_frame_size Output data frame size
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
@ -1,62 +1,61 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "stdbool.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
//Opaque AFE_SR data container
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t
|
||||
{
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
|
||||
// if wakenet is enabled, the window length is the receptive field of wakenet(about 1.5s), otherwise it is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected, it stores the wake word index, which starts from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index starts from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t (afe_vad_state_t is deprecated)
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void* reserved; // reserved for future use
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if wakenet is enabled, the window length is the receptive field of
|
||||
// wakenet(about 1.5s), otherwise it is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected, it stores the wake word index, which starts from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
|
||||
// starts from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t (afe_vad_state_t is deprecated)
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* when wakenet has been initialized. Only wakenet 1 is supported for now.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Enable VAD algorithm.
|
||||
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
@ -191,11 +191,11 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
@ -212,16 +212,14 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct
|
||||
{
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
}afe_task_into_t;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
89
include/esp32/esp_mfcc_iface.h
Normal file
89
include/esp32/esp_mfcc_iface.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "esp_speech_features.h"
|
||||
|
||||
/*
|
||||
This describes an interface for an MFCC runner, that is, some kind of implementation that can be
|
||||
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
|
||||
multiple implementations can be used.
|
||||
*/
|
||||
|
||||
|
||||
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
|
||||
|
||||
|
||||
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
|
||||
//refer to its documentation for details.
|
||||
typedef struct {
|
||||
int winstep_ms; // The step between successive windows in ms. (10)
|
||||
int winlen_ms; // The length of the analysis window in ms. (25)
|
||||
int nch; // The number of input channel
|
||||
int numcep; // The number of cepstrum to return
|
||||
int nfilter; // The number of filters in the filterbank
|
||||
int nfft; // The FFT size
|
||||
int samp_freq; // The sample-rate of the signal.
|
||||
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
|
||||
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
|
||||
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
|
||||
char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey"
|
||||
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
|
||||
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
|
||||
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
|
||||
float log_epsilon; // log epsilon. (e.g. 1e-7)
|
||||
bool psram_first; // Alloc memory from PSRAM first
|
||||
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
|
||||
} esp_mfcc_opts_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Un-initialize and free a mfcc runner
|
||||
*
|
||||
* Function to free a previously allocated mfcc runner.
|
||||
*
|
||||
* @param r Runner object to destroy
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Initialize parameters for a mfcc runner.
|
||||
*
|
||||
* After creation, a mfcc runner needs to be initialized first; this is usually done
|
||||
* in the initialization routine of a speech recognition algorithm. This provides
|
||||
* a pointer to do this for a specific mfcc runner.
|
||||
*
|
||||
* @param opt Options for the mfcc process
|
||||
* @return Handle to the created mfcc runner data (esp_mfcc_data_t *).
|
||||
*/
|
||||
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
|
||||
|
||||
/**
|
||||
* @brief Run a mfcc iteration on frame by frame
|
||||
*
|
||||
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
|
||||
* an initial call to this function may return NULL and subsequent calls may return the
|
||||
* cepstrum of previous calls.
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
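* @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq*winstep_ms/1000.
|
||||
* @param nch The number of input channels (NOTE(review): presumably matches esp_mfcc_opts_t.nch -- confirm).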
|
||||
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
|
||||
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
|
||||
* to this function is done.
|
||||
*/
|
||||
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
|
||||
|
||||
/**
|
||||
* @brief Clean all state of mfcc handle
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Operations possible on a mfcc runner
|
||||
*/
|
||||
typedef struct {
|
||||
esp_mfcc_op_destroy_t destroy; // Un-initialize and free a mfcc runner
|
||||
esp_mfcc_op_create_t create; // Create a mfcc runner from an esp_mfcc_opts_t
|
||||
esp_mfcc_op_run_step_t run_step; // Run one mfcc iteration on a frame of samples
|
||||
esp_mfcc_op_clean_t clean; // Clean all state of the mfcc handle
|
||||
} esp_mfcc_iface_t;
|
||||
40
include/esp32/esp_mfcc_models.h
Normal file
40
include/esp32/esp_mfcc_models.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
#include "esp_mfcc_iface.h"
|
||||
|
||||
|
||||
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9 & multinet5
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
*
|
||||
opts->psram_first = true;
|
||||
opts->use_power = true;
|
||||
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
|
||||
opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
|
||||
opts->win_type = "povey";
|
||||
opts->low_freq = 20;
|
||||
opts->high_freq = 7600;
|
||||
opts->samp_freq = 16000;
|
||||
opts->nch = 1;
|
||||
opts->nfft = 512;
|
||||
opts->nfilter = 80;
|
||||
opts->numcep = 80;
|
||||
opts->preemph = 0.97;
|
||||
opts->append_energy = false;
|
||||
opts->winlen_ms = 25;
|
||||
opts->winstep_ms = 10;
|
||||
opts->remove_dc_offset = true;
|
||||
*
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_kaldi();
|
||||
|
||||
/**
|
||||
* @brief Print mfcc opts
|
||||
**/
|
||||
void print_mfcc_opts(esp_mfcc_opts_t *opts);
|
||||
64
include/esp32/esp_speech_features.h
Normal file
64
include/esp32/esp_speech_features.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "c_speech_features_config.h"
|
||||
#include "stdlib.h"
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
// Mel filter state: returned by esp_mel_filter_init(), consumed by esp_mel_dotprod_step()
// and released with esp_mel_filter_deinit().
typedef struct
|
||||
{
|
||||
float *coeff; // NOTE(review): presumably the mel filter coefficients -- confirm against the implementation
|
||||
int *bank_pos; // NOTE(review): presumably per-filter positions in the spectrum -- confirm against the implementation
|
||||
int nfilter; // NOTE(review): presumably the number of filters (cf. nfilter argument of esp_mel_filter_init) -- confirm
|
||||
} esp_mel_filter_t;
|
||||
|
||||
float* esp_mfcc_malloc(size_t size, bool from_psram);
|
||||
|
||||
void esp_mfcc_free(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Initialize FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param nfft The input samples number
|
||||
* @return fft-table
|
||||
**/
|
||||
void* esp_fft_init(int nfft);
|
||||
|
||||
/**
|
||||
* @brief Free FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param fft_table The fft table initialized by esp_fft_init
|
||||
* @param nfft The input samples number
|
||||
* @return None
|
||||
**/
|
||||
void esp_fft_deinit(void *fft_table, int nfft);
|
||||
|
||||
/**
|
||||
* @brief Initialize window function
|
||||
* Currently support hanning, hamming, sine, povey, rectangular,
|
||||
* wn9(512-hanning to get wakenet9& multinet5 compatible)
|
||||
**/
|
||||
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
|
||||
|
||||
float* esp_fftr(float* x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
|
||||
|
||||
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
|
||||
bool from_psram);
|
||||
|
||||
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
|
||||
|
||||
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
|
||||
float epsilon);
|
||||
29
include/esp32p4/c_speech_features_config.h
Normal file
29
include/esp32p4/c_speech_features_config.h
Normal file
@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
/* #undef ENABLE_DOUBLE */
|
||||
|
||||
#ifdef ENABLE_DOUBLE
|
||||
# define csf_float double
|
||||
# define csf_ceil ceil
|
||||
# define csf_floor floor
|
||||
# define csf_sin sin
|
||||
# define csf_log log
|
||||
# define csf_log10 log10
|
||||
# define csf_pow pow
|
||||
# define csf_sqrt sqrt
|
||||
# define csf_abs fabs
|
||||
# define csf_float_min DBL_MIN
|
||||
#else
|
||||
# define csf_float float
|
||||
# define csf_ceil ceilf
|
||||
# define csf_floor floorf
|
||||
# define csf_sin sinf
|
||||
# define csf_log logf
|
||||
# define csf_log10 log10f
|
||||
# define csf_pow powf
|
||||
# define csf_sqrt sqrtf
|
||||
# define csf_abs fabsf
|
||||
# define csf_float_min FLT_MIN
|
||||
#endif
|
||||
@ -1,241 +1,245 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdlib.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "model_path.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//VC: Voice Communication
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// VC: Voice Communication
|
||||
|
||||
//Set AFE_SR mode
|
||||
// Set AFE_SR mode
|
||||
typedef enum {
|
||||
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
} afe_sr_mode_t;
|
||||
|
||||
//Set AFE mode
|
||||
// Set AFE mode
|
||||
typedef enum {
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
} afe_mode_t;
|
||||
|
||||
//Set AFE type
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
} afe_memory_alloc_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
} afe_mn_peak_agc_mode_t;
|
||||
|
||||
typedef struct {
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t* mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t* ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t *mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t *ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
} afe_pcm_config_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
} afe_ns_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
|
||||
* You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
* @param out_frame_size Output data frame size
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
@ -1,62 +1,61 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "stdbool.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
//Opaque AFE_SR data container
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t
|
||||
{
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
|
||||
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
|
||||
vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void* reserved; // reserved for future use
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was woken up. Index
|
||||
// starts from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* @brief Initialize the wakenet and wake word coefficients, or reset them
|
||||
* when wakenet has already been initialized. Only wakenet 1 is supported now.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Enable VAD algorithm.
|
||||
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
@ -191,11 +191,11 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
@ -212,16 +212,14 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct
|
||||
{
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
}afe_task_into_t;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
89
include/esp32p4/esp_mfcc_iface.h
Normal file
89
include/esp32p4/esp_mfcc_iface.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "esp_speech_features.h"
|
||||
|
||||
/*
|
||||
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
|
||||
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
|
||||
multiple implementations can be used.
|
||||
*/
|
||||
|
||||
|
||||
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
|
||||
|
||||
|
||||
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from
|
||||
// c_speech_features); please refer to its documentation for details.
|
||||
typedef struct {
|
||||
int winstep_ms; // The step between successive windows, in ms. (e.g. 10)
|
||||
int winlen_ms; // The length of the analysis window, in ms. (e.g. 25)
|
||||
int nch; // The number of input channels
|
||||
int numcep; // The number of cepstral coefficients to return
|
||||
int nfilter; // The number of filters in the filterbank
|
||||
int nfft; // The FFT size
|
||||
int samp_freq; // The sample rate of the signal, in Hz.
|
||||
int low_freq; // The lowest band edge of the mel filters, in Hz. (e.g. 0)
|
||||
int high_freq; // The highest band edge of the mel filters, in Hz. Must not be higher than samp_freq.
// NOTE(review): mel band edges normally cannot exceed samp_freq / 2 (Nyquist) - confirm the real limit.
|
||||
float preemph; // Pre-emphasis filter coefficient. 0 means no filter. (e.g. 0.97)
|
||||
char *win_type; // Analysis window type to apply to each frame: "hanning", "hamming", "sine", "rectangular", "povey"
|
||||
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
|
||||
bool use_power; // If true, use the power of the FFT spectrum; otherwise use the magnitude of the FFT spectrum
|
||||
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
|
||||
float log_epsilon; // The epsilon used by the log modes of use_log_fbank. (e.g. 1e-7)
|
||||
bool psram_first; // Allocate memory from PSRAM first
|
||||
bool remove_dc_offset; // Whether to subtract the mean of the wave before the FFT
|
||||
} esp_mfcc_opts_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Un-initialize and free a mfcc runner
|
||||
*
|
||||
* Function to free a previously allocated mfcc runner.
|
||||
*
|
||||
* @param r Runner object to destroy
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Initialize parameters for a mfcc runner.
|
||||
*
|
||||
* After creation, a mfcc runner needs to be initialized first; this is usually done
|
||||
* in the initialization routine of a speech recognition algorithm. This provides
|
||||
* a pointer to do this for a specific mfcc runner.
|
||||
*
|
||||
* @param opt Options for the mfcc process
|
||||
* @return Handle to the created mfcc runner (an esp_mfcc_data_t pointer).
|
||||
*/
|
||||
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
|
||||
|
||||
/**
|
||||
* @brief Run a mfcc iteration on frame by frame
|
||||
*
|
||||
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
|
||||
* an initial call to this function may return NULL and subsequent calls may return the
|
||||
* cepstrum of previous calls.
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
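* @param samp An array of signed 16-bit samples. The number of samples should be samp_freq * winstep_ms / 1000 (one window step).
|
||||
* @param nch The number of channels (presumably of the data in samp - TODO confirm against the implementation).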
|
||||
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
|
||||
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
|
||||
* to this function is done.
|
||||
*/
|
||||
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
|
||||
|
||||
/**
|
||||
* @brief Clean all state of mfcc handle
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Operations possible on a mfcc runner
|
||||
*/
|
||||
// Table of function pointers through which a mfcc runner implementation is used.
typedef struct {
|
||||
esp_mfcc_op_destroy_t destroy; // Un-initialize and free a mfcc runner
|
||||
esp_mfcc_op_create_t create; // Create a mfcc runner from a set of esp_mfcc_opts_t options
|
||||
esp_mfcc_op_run_step_t run_step; // Run one mfcc iteration on a frame of samples
|
||||
esp_mfcc_op_clean_t clean; // Clean all state of a mfcc runner
|
||||
} esp_mfcc_iface_t;
|
||||
40
include/esp32p4/esp_mfcc_models.h
Normal file
40
include/esp32p4/esp_mfcc_models.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
#include "esp_mfcc_iface.h"
|
||||
|
||||
|
||||
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return the basic opts used in wakenet9 & multinet5
|
||||
*
|
||||
* @return Pointer to the esp_mfcc_opts_t holding these options
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9(void);
|
||||
|
||||
/**
|
||||
* @brief Return the basic opts for default kaldifeat. The options are preset as listed below.
|
||||
*
|
||||
opts->psram_first = true;
|
||||
opts->use_power = true;
|
||||
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
|
||||
opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
|
||||
opts->win_type = "povey";
|
||||
opts->low_freq = 20;
|
||||
opts->high_freq = 7600;
|
||||
opts->samp_freq = 16000;
|
||||
opts->nch = 1;
|
||||
opts->nfft = 512;
|
||||
opts->nfilter = 80;
|
||||
opts->numcep = 80;
|
||||
opts->preemph = 0.97;
|
||||
opts->append_energy = false;
|
||||
opts->winlen_ms = 25;
|
||||
opts->winstep_ms = 10;
|
||||
opts->remove_dc_offset = true;
|
||||
*
|
||||
* @return Pointer to the esp_mfcc_opts_t holding these options
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_kaldi(void);
|
||||
|
||||
/**
|
||||
* @brief Print mfcc opts
|
||||
**/
|
||||
void print_mfcc_opts(esp_mfcc_opts_t *opts);
|
||||
64
include/esp32p4/esp_speech_features.h
Normal file
64
include/esp32p4/esp_speech_features.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "c_speech_features_config.h"
|
||||
#include "stdlib.h"
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float *coeff;
|
||||
int *bank_pos;
|
||||
int nfilter;
|
||||
} esp_mel_filter_t;
|
||||
|
||||
float* esp_mfcc_malloc(size_t size, bool from_psram);
|
||||
|
||||
void esp_mfcc_free(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Initialize FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param nfft The input samples number
|
||||
* @return fft-table
|
||||
**/
|
||||
void* esp_fft_init(int nfft);
|
||||
|
||||
/**
|
||||
* @brief Free FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param fft_table The fft table initialized by esp_fft_init
|
||||
* @param nfft The input samples number
|
||||
* @return None
|
||||
**/
|
||||
void esp_fft_deinit(void *fft_table, int nfft);
|
||||
|
||||
/**
|
||||
* @brief Initialize the window function
|
||||
* Currently supports hanning, hamming, sine, povey, rectangular,
|
||||
* wn9(512-hanning to get wakenet9& multinet5 compatible)
|
||||
**/
|
||||
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
|
||||
|
||||
float* esp_fftr(float* x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
|
||||
|
||||
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
|
||||
bool from_psram);
|
||||
|
||||
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
|
||||
|
||||
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
|
||||
float epsilon);
|
||||
29
include/esp32s3/c_speech_features_config.h
Normal file
29
include/esp32s3/c_speech_features_config.h
Normal file
@ -0,0 +1,29 @@
|
||||
#pragma once
|
||||
#include <float.h>
|
||||
#include <math.h>
|
||||
|
||||
/* #undef ENABLE_DOUBLE */
|
||||
|
||||
#ifdef ENABLE_DOUBLE
|
||||
# define csf_float double
|
||||
# define csf_ceil ceil
|
||||
# define csf_floor floor
|
||||
# define csf_sin sin
|
||||
# define csf_log log
|
||||
# define csf_log10 log10
|
||||
# define csf_pow pow
|
||||
# define csf_sqrt sqrt
|
||||
# define csf_abs fabs
|
||||
# define csf_float_min DBL_MIN
|
||||
#else
|
||||
# define csf_float float
|
||||
# define csf_ceil ceilf
|
||||
# define csf_floor floorf
|
||||
# define csf_sin sinf
|
||||
# define csf_log logf
|
||||
# define csf_log10 log10f
|
||||
# define csf_pow powf
|
||||
# define csf_sqrt sqrtf
|
||||
# define csf_abs fabsf
|
||||
# define csf_float_min FLT_MIN
|
||||
#endif
|
||||
@ -1,241 +1,245 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdlib.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "model_path.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//VC: Voice Communication
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// VC: Voice Communication
|
||||
|
||||
//Set AFE_SR mode
|
||||
// Set AFE_SR mode
|
||||
typedef enum {
|
||||
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
} afe_sr_mode_t;
|
||||
|
||||
//Set AFE mode
|
||||
// Set AFE mode
|
||||
typedef enum {
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
} afe_mode_t;
|
||||
|
||||
//Set AFE type
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
|
||||
} afe_memory_alloc_mode_t;
|
||||
|
||||
// Peak-amplitude AGC modes; each enumerator's value is the peak level in dB given in its comment.
typedef enum {
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
} afe_mn_peak_agc_mode_t;
|
||||
|
||||
typedef struct {
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t* mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t* ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t *mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t *ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
} afe_pcm_config_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
} afe_ns_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long.
|
||||
* @param data The debug audio data which don't be modify. It should be copied away as soon as possible that
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size);
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
// Configuration of the AFE (Audio Front-End): per-algorithm switches and parameters, plus general AFE settings.
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad. If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad. If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init; // presumably whether to init wakenet, like the other *_init flags - TODO confirm
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. The gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output; the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
|
||||
* You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
* @param out_frame_size Output data frame size
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
@ -1,62 +1,61 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "stdbool.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
//Opaque AFE_SR data container
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t
|
||||
{
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
|
||||
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
|
||||
vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void* reserved; // reserved for future use
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
|
||||
// start from 1.
|
||||
vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient
|
||||
* when wakenet has been initialized. It's only support wakenet 1 now.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Enable VAD algorithm.
|
||||
@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
@ -191,11 +191,11 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
@ -212,16 +212,14 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct
|
||||
{
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
}afe_task_into_t;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
89
include/esp32s3/esp_mfcc_iface.h
Normal file
89
include/esp32s3/esp_mfcc_iface.h
Normal file
@ -0,0 +1,89 @@
|
||||
#pragma once
|
||||
#include <stdint.h>
|
||||
#include "esp_speech_features.h"
|
||||
|
||||
/*
|
||||
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
|
||||
fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so
|
||||
multiple implementations can be used.
|
||||
*/
|
||||
|
||||
|
||||
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
|
||||
|
||||
|
||||
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
|
||||
//refer to its documentation for details.
|
||||
typedef struct {
|
||||
int winstep_ms; // The step between successive windows in ms. (10)
|
||||
int winlen_ms; // The length of the analysis window in ms. (25)
|
||||
int nch; // The number of input channel
|
||||
int numcep; // The number of cepstrum to return
|
||||
int nfilter; // The number of filters in the filterbank
|
||||
int nfft; // The FFT size
|
||||
int samp_freq; // The sample-rate of the signal.
|
||||
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
|
||||
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
|
||||
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
|
||||
char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey"
|
||||
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
|
||||
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
|
||||
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
|
||||
float log_epsilon; // log epsilon. (e.g. 1e-7)
|
||||
bool psram_first; // Alloc memory from PSRAM first
|
||||
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
|
||||
} esp_mfcc_opts_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Un-initialize and free a mfcc runner
|
||||
*
|
||||
* Function to free a previously allocated mfcc runner.
|
||||
*
|
||||
* @param r Runner object to destroy
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Initialize parameters for a mfcc runner.
|
||||
*
|
||||
* After creation, a mfcc runner needs to be initialized first; this is usually done
|
||||
* in the initialization routine of a speech recognition algorithm. This provides
|
||||
* a pointer to do this for a specific mfcc runner.
|
||||
*
|
||||
* @param opt Options for the mfcc process
|
||||
* @return Pointer to the newly created mfcc runner data.
|
||||
*/
|
||||
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
|
||||
|
||||
/**
|
||||
* @brief Run a mfcc iteration on frame by frame
|
||||
*
|
||||
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
|
||||
* an initial call to this function may return NULL and subsequent calls may return the
|
||||
* cepstrum of previous calls.
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
* @param samp An array of signed 16-bit samples. The amount of samples should be samp_freq * winstep_ms / 1000.
|
||||
* @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function
|
||||
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
|
||||
* to this function is done.
|
||||
*/
|
||||
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
|
||||
|
||||
/**
|
||||
* @brief Clean all state of mfcc handle
|
||||
*
|
||||
* @param r The mfcc runner
|
||||
*/
|
||||
typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r);
|
||||
|
||||
/**
|
||||
* @brief Operations possible on a mfcc runner
|
||||
*/
|
||||
typedef struct {
|
||||
esp_mfcc_op_destroy_t destroy;
|
||||
esp_mfcc_op_create_t create;
|
||||
esp_mfcc_op_run_step_t run_step;
|
||||
esp_mfcc_op_clean_t clean;
|
||||
} esp_mfcc_iface_t;
|
||||
40
include/esp32s3/esp_mfcc_models.h
Normal file
40
include/esp32s3/esp_mfcc_models.h
Normal file
@ -0,0 +1,40 @@
|
||||
#pragma once
|
||||
#include "esp_mfcc_iface.h"
|
||||
|
||||
|
||||
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
|
||||
|
||||
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9 & multinet5
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
*
|
||||
opts->psram_first = true;
|
||||
opts->use_power = true;
|
||||
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
|
||||
opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps
|
||||
opts->win_type = "povey";
|
||||
opts->low_freq = 20;
|
||||
opts->high_freq = 7600;
|
||||
opts->samp_freq = 16000;
|
||||
opts->nch = 1;
|
||||
opts->nfft = 512;
|
||||
opts->nfilter = 80;
|
||||
opts->numcep = 80;
|
||||
opts->preemph = 0.97;
|
||||
opts->append_energy = false;
|
||||
opts->winlen_ms = 25;
|
||||
opts->winstep_ms = 10;
|
||||
opts->remove_dc_offset = true;
|
||||
*
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_kaldi();
|
||||
|
||||
/**
|
||||
* @brief Print mfcc opts
|
||||
**/
|
||||
void print_mfcc_opts(esp_mfcc_opts_t *opts);
|
||||
64
include/esp32s3/esp_speech_features.h
Normal file
64
include/esp32s3/esp_speech_features.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "c_speech_features_config.h"
|
||||
#include "stdlib.h"
|
||||
#include <assert.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
#ifndef M_2PI
|
||||
#define M_2PI 6.283185307179586476925286766559005
|
||||
#endif
|
||||
|
||||
typedef struct
|
||||
{
|
||||
float *coeff;
|
||||
int *bank_pos;
|
||||
int nfilter;
|
||||
} esp_mel_filter_t;
|
||||
|
||||
float* esp_mfcc_malloc(size_t size, bool from_psram);
|
||||
|
||||
void esp_mfcc_free(void *ptr);
|
||||
|
||||
/**
|
||||
* @brief Initialize FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param nfft The input samples number
|
||||
* @return fft-table
|
||||
**/
|
||||
void* esp_fft_init(int nfft);
|
||||
|
||||
/**
|
||||
* @brief Free FFT table
|
||||
* @warning For ESP-PLATFORM, use esp-dsp fft
|
||||
* For Other platform, use kiss fft
|
||||
*
|
||||
* @param fft_table The fft table initialized by esp_fft_init
|
||||
* @param nfft The input samples number
|
||||
* @return None
|
||||
**/
|
||||
void esp_fft_deinit(void *fft_table, int nfft);
|
||||
|
||||
/**
|
||||
* @brief Initial window function
|
||||
* Currently support hanning, hamming, sine, povey, rectangular,
|
||||
* wn9(512-hanning to get wakenet9& multinet5 compatible)
|
||||
**/
|
||||
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
|
||||
|
||||
float* esp_fftr(float* x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
|
||||
|
||||
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
|
||||
bool from_psram);
|
||||
|
||||
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
|
||||
|
||||
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
|
||||
float epsilon);
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -4,6 +4,7 @@ set(srcs
|
||||
"test_wakenet.cpp"
|
||||
"test_multinet.cpp"
|
||||
"test_afe.cpp"
|
||||
"test_mfcc.cpp"
|
||||
)
|
||||
|
||||
idf_component_register(SRCS ${srcs}
|
||||
|
||||
63
test_apps/esp-sr/main/test_mfcc.cpp
Normal file
63
test_apps/esp-sr/main/test_mfcc.cpp
Normal file
@ -0,0 +1,63 @@
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
#include "string.h"
|
||||
#include <limits.h>
|
||||
#include "unity.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_heap_caps.h"
|
||||
#include "esp_mfcc_iface.h"
|
||||
#include "esp_mfcc_models.h"
|
||||
#include "alexa.h"
|
||||
|
||||
/**
 * @brief Build the fbank options used by the speech-features test.
 *
 * Every field of esp_mfcc_opts_t is assigned explicitly, so the returned
 * structure contains no uninitialized members.
 *
 * NOTE(review): despite the "kaldi" name, this configuration uses a
 * "hanning" window and returns the raw (non-log) filterbank output, which
 * differs from the defaults documented for get_mfcc_opts_kaldi().
 *
 * @param dim Value stored in `numcep` (the number of coefficients to return).
 * @return Heap-allocated options, or NULL if the allocation fails.
 *         NOTE(review): whether the runner's create() copies the options or
 *         keeps this pointer is not visible here; confirm before freeing.
 */
esp_mfcc_opts_t *get_fbank_opts_kaldi(int dim)
{
    esp_mfcc_opts_t *opts = static_cast<esp_mfcc_opts_t *>(malloc(sizeof(*opts)));
    if (opts == NULL) {
        return NULL;
    }

    // Memory placement
    opts->psram_first = true;

    // Spectrum / filterbank post-processing
    opts->use_power = true;
    opts->use_log_fbank = 0;                        // 0: return the fbank values without taking the log
    opts->log_epsilon = 1.1920928955078125e-07f;    // torch.finfo(torch.float32).eps
    opts->win_type = const_cast<char *>("hanning"); // remove [-Wwrite-strings] warning
    opts->preemph = 0.97f;
    opts->append_energy = false;
    opts->remove_dc_offset = true;

    // Signal and filterbank geometry
    opts->low_freq = 20;
    opts->high_freq = 7600;
    opts->samp_freq = 16000;
    opts->nch = 1;
    opts->nfft = 512;
    opts->nfilter = 80;
    opts->numcep = dim;
    opts->winlen_ms = 25;
    opts->winstep_ms = 10;

    return opts;
}
|
||||
|
||||
|
||||
/**
 * Runs the float32 fbank runner twice over the first 512 samples of the
 * `alexa` clip and checks that creating, running and destroying a runner
 * does not leak heap memory.
 */
TEST_CASE("test speech features", "[fbank]")
{
    const int num_samples = 512;
    const int out_dim = 80;
    const size_t buffer_bytes = num_samples * sizeof(int16_t);

    // Allocated before the baseline measurement so it does not count as a leak.
    int16_t *buffer = (int16_t *) malloc(buffer_bytes);
    TEST_ASSERT_NOT_NULL(buffer);

    const esp_mfcc_iface_t *fbank_handle = &esp_fbank_f32;
    float *fbank_out = NULL;

    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // First pass: create, run one step, print the features, destroy.
    // NOTE(review): the options structure is never freed here; whether the
    // runner takes ownership of it is not visible from this file.
    esp_mfcc_opts_t *opts = get_fbank_opts_kaldi(out_dim);
    TEST_ASSERT_NOT_NULL(opts);
    esp_mfcc_data_t *fbank_model = fbank_handle->create(opts);
    memcpy(buffer, alexa, buffer_bytes);
    // NOTE(review): nch is passed as 0 although opts->nch is 1 - confirm intent.
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);

    // run_step is documented as possibly returning NULL on an initial call.
    TEST_ASSERT_NOT_NULL(fbank_out);

    // Print while the runner is still alive: the output is not guaranteed to
    // remain valid once the runner has been destroyed.
    for (int i = 0; i < out_dim; i++) {
        printf("%f ", fbank_out[i]);
    }

    fbank_handle->destroy(fbank_model);
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // Second pass: the free heap must return to the level seen after the first pass.
    opts = get_fbank_opts_kaldi(out_dim);
    TEST_ASSERT_NOT_NULL(opts);
    fbank_model = fbank_handle->create(opts);
    memcpy(buffer, alexa, buffer_bytes);
    fbank_out = fbank_handle->run_step(fbank_model, buffer, 0);
    fbank_handle->destroy(fbank_model);
    int second_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    // Release before asserting so a failed assertion does not leak the buffer.
    free(buffer);

    TEST_ASSERT_TRUE(start_size - first_end_size < 100); // there are some memory leak in system
    TEST_ASSERT_EQUAL(first_end_size, second_end_size);
}
|
||||
Loading…
Reference in New Issue
Block a user