diff --git a/include/esp32/c_speech_features_config.h b/include/esp32/c_speech_features_config.h new file mode 100644 index 0000000..e21e020 --- /dev/null +++ b/include/esp32/c_speech_features_config.h @@ -0,0 +1,29 @@ +#pragma once +#include +#include + +/* #undef ENABLE_DOUBLE */ + +#ifdef ENABLE_DOUBLE +# define csf_float double +# define csf_ceil ceil +# define csf_floor floor +# define csf_sin sin +# define csf_log log +# define csf_log10 log10 +# define csf_pow pow +# define csf_sqrt sqrt +# define csf_abs fabs +# define csf_float_min DBL_MIN +#else +# define csf_float float +# define csf_ceil ceilf +# define csf_floor floorf +# define csf_sin sinf +# define csf_log logf +# define csf_log10 log10f +# define csf_pow powf +# define csf_sqrt sqrtf +# define csf_abs fabsf +# define csf_float_min FLT_MIN +#endif diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 694caa2..16906bd 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -1,241 +1,245 @@ #pragma once -#include "stdint.h" -#include "stdbool.h" -#include "stdlib.h" -#include "esp_wn_iface.h" -#include "esp_wn_models.h" -#include "esp_vad.h" #include "esp_aec.h" #include "esp_agc.h" -#include "model_path.h" -#include "esp_vadn_models.h" #include "esp_nsn_models.h" +#include "esp_vad.h" +#include "esp_vadn_models.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" +#include "model_path.h" +#include "stdbool.h" +#include "stdint.h" +#include "stdlib.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//VC: Voice Communication +// AFE: Audio Front-End +// SR: Speech Recognition +// VC: Voice Communication -//Set AFE_SR mode +// Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode - SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode + SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE 
mode: low cost mode + SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; -//Set AFE mode +// Set AFE mode typedef enum { - AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode - AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode } afe_mode_t; -//Set AFE type +// Set AFE type typedef enum { - AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression } afe_type_t; typedef enum { - AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram - AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance - AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram + AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram + AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance + AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB - AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num, include 
microphone channel, playback channel and unknown channel - int mic_num; // microphone channel number - uint8_t* mic_ids; // microphone channel indices - int ref_num; // playback reference channel number - uint8_t* ref_ids; // playback reference channel indices - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t *mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t *ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" - AFE_NS_MODE_NET = 1, // please use model name of NSNET + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; typedef enum { - AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC - AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated } afe_agc_mode_t; /** * @brief Function to get the debug audio data * - * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long. + * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that + * avoid blocking for too long. * @param data_size The number of bytes of data. 
* @returns */ -typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size); +typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size); typedef enum { - AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task - AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task + AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task + AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task AFE_DEBUG_HOOK_MAX = 2 } afe_debug_hook_type_t; typedef struct { - afe_debug_hook_type_t hook_type; // debug type of hook - afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data + afe_debug_hook_type_t hook_type; // debug type of hook + afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data } afe_debug_hook_t; typedef struct { /********** AEC(Acoustic Echo Cancellation) **********/ - bool aec_init; // Whether to init aec - aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF - int aec_filter_length; // The filter length of aec + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec /********** SE(Speech Enhancement, microphone array processing) **********/ - bool se_init; // Whether to init se + bool se_init; // Whether to init se /********** NS(Noise Suppression) **********/ - bool ns_init; // Whether to init ns - char *ns_model_name; // Model name of ns - afe_ns_mode_t afe_ns_mode; // Model mode of ns - + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + /********** VAD(Voice Activity Detection) **********/ - bool vad_init; // Whether to init vad - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 - char *vad_model_name; // 
The model name of vad, If it is null, WebRTC VAD will be used. - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms - int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false - bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: + // 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - char *wakenet_model_name; // The model name of wakenet 1 - char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; // The mode of wakenet + char *wakenet_model_name; // The model name of wakenet 1 + char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 + det_mode_t wakenet_mode; // The mode of wakenet /********** AGC(Automatic Gain Control) **********/ - bool agc_init; // Whether to init agc - afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
- int agc_compression_gain_db; // Compression gain in dB (default 9) - int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + bool agc_init; // Whether to init agc + afe_agc_mode_t + agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) /********** General AFE(Audio Front End) parameter **********/ - afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. - afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. - int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. - int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. - afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM - float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. + afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. 
+ afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts + // directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone - // otherwise, select channel number by wakenet + bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone + // otherwise, select channel number by wakenet } afe_config_t; /** - * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. - * You can manually fine-tune it after creating the configuration - * - * The input format: + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based + * on the chip target and input format. 
You can manually fine-tune it after creating the configuration + * + * The input format: * M to represent the microphone channel * R to represent the playback reference channel * N to represent an unknown or unused channel - * - * For example, input_format="MMNR" indicates that the input data consists of four channels, + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, * which are the microphone channel, the microphone channel, an unused channel, and the playback channel - * + * * @param input_format The input format * @param models Models from partition, which is configured by Kconfig * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - * + * * @return afe_config_t* The default config of afe */ afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); /** * @brief Check AFE configuration and make sure it is correct. - * - * @warning If there is a configuration conflict, this function will modify some parameters. + * + * @warning If there is a configuration conflict, this function will modify some parameters. * The guiding behind these modifications is to maintain the highest performance of the output audio and results. * And remove the conflict between different algorithms. - * + * * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
- * + * * @param afe_config Input AFE config - * + * * @return afe_config_t* The modified AFE config */ afe_config_t *afe_config_check(afe_config_t *afe_config); /** * @brief Parse input format - * + * * @param input_format The input format, same with afe_config_init() function * @param pcm_config The pcm config - * + * * @return true if the input format is parsed successfully, otherwise false */ -bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); +bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config); /** * @brief Parse I2S input data - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param mic_data The output microphone data * @param ref_data The output playback reference data * @param pcm_config The pcm config - * + * */ -void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); +void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config); /** * @brief Parse input data, from interleaved arrangement to contiguous arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Format input data, from contiguous arrangement to interleaved arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_format_data(int16_t *data, int 
frame_size, int channel_num, int16_t* out_data); +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param data The input audio data * @param frame_size The frame size of input, it is also the size of single channel data * @param factor The gain factor - * + * * @return int16_t* The output audio data */ -int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); +int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param in_data The input audio data * @param in_frame_size Input data frame size of input * @param channel_num The channel number of input data, which is same as output data @@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); * @param out_frame_size Onput data frame size of input * */ -void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size); /** * @brief Copy the afe config - * + * * @param dst_config The destination afe config * @param src_config The source afe config - * + * * @return The destination afe config */ -afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); +afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); /** * @brief Print the afe config - * + * * @param afe_config The afe config */ void afe_config_print(const afe_config_t *afe_config); /** * @brief Allocate afe config - * + * * @return The afe config pointer */ afe_config_t *afe_config_alloc(); /** * @brief Free afe config - * + * * @param afe_config The afe config pointer */ void afe_config_free(afe_config_t 
*afe_config); diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index f434c3e..580eed9 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -1,62 +1,61 @@ #pragma once +#include "esp_afe_config.h" +#include "stdbool.h" #include "stdint.h" #include "stdlib.h" -#include "stdbool.h" -#include "esp_afe_config.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition +// AFE: Audio Front-End +// SR: Speech Recognition +// afe_sr/AFE_SR: the audio front-end for speech recognition -//Opaque AFE_SR data container +// Opaque AFE_SR data container typedef struct esp_afe_sr_data_t esp_afe_sr_data_t; - - /** * @brief The state of vad */ -typedef enum -{ - AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence - AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech +typedef enum { + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** * @brief The result of fetch function */ -typedef struct afe_fetch_result_t -{ - int16_t *data; // the target channel data of audio. - int data_size; // the size of data. The unit is byte. - int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. - int vad_cache_size; // the size of vad_cache. The unit is byte. - float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). - // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. - wakenet_state_t wakeup_state; // the value is wakenet_state_t - int wake_word_index; // if the wake word is detected. 
It will store the wake word index which start from 1. - int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - vad_state_t vad_state; // the value is afe_vad_state_t - int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. The unit is the number of samples. - int ret_value; // the return state of fetch function - int16_t *raw_data; // the multi-channel output data of audio. - int raw_data_channels; // the channel number of raw data - void* reserved; // reserved for future use +typedef struct afe_fetch_result_t { + int16_t *data; // the target channel data of audio. + int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the + // audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. + float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. + // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of + // wakenet(about 1.5s), otherwise is the frame length. + wakenet_state_t wakeup_state; // the value is wakenet_state_t + int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. + int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index + // start from 1. + vad_state_t vad_state; // the value is afe_vad_state_t + int trigger_channel_id; // the channel index of output + int wake_word_length; // the length of wake word. The unit is the number of samples. + int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data + void *reserved; // reserved for future use } afe_fetch_result_t; /** * @brief Function to initialze a AFE_SR instance - * + * * @param afe_config The config of AFE_SR * @returns Handle to the AFE_SR data */ -typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); +typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); /** * @brief Get the amount of each channel samples per frame that need to be passed to the function @@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** * @brief Get the channel number - * + * * @param afe The AFE_SR object to query * @return The amount of total channels */ @@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe); * The last channel is reference signal if it has reference data. * * @param afe The AFE_SR object to query - * - * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the + * + * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the * `get_feed_chunksize`. * @return The size of input */ -typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in); +typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR @@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` @@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af * * @param afe The AFE_SR object to query * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); /** * @brief reset ringbuf of AFE. @@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); /** - * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient + * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient * when wakenet has been initialized. It's only support wakenet 1 now. 
* * @param afe The AFE_SR object to query * @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); +typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** * @brief Enable VAD algorithm. @@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); */ typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe); - /** * This structure contains the functions used to do operations on a AFE_SR. */ @@ -191,11 +191,11 @@ typedef struct { esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; - esp_afe_sr_iface_op_set_wakenet_t set_wakenet; + esp_afe_sr_iface_op_set_wakenet_t set_wakenet; esp_afe_sr_iface_op_disable_func_t disable_wakenet; esp_afe_sr_iface_op_enable_func_t enable_wakenet; esp_afe_sr_iface_op_disable_func_t disable_aec; @@ -212,16 +212,14 @@ typedef struct { esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; - // struct is used to store the AFE handle and data for the AFE task -typedef struct -{ +typedef struct { esp_afe_sr_data_t *afe_data; esp_afe_sr_iface_t *afe_handle; - TaskHandle_t feed_task; - TaskHandle_t fetch_task; -}afe_task_into_t; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +} afe_task_into_t; #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32/esp_afe_sr_models.h 
b/include/esp32/esp_afe_sr_models.h index 05a08d3..5b37b42 100644 --- a/include/esp32/esp_afe_sr_models.h +++ b/include/esp32/esp_afe_sr_models.h @@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32/esp_mfcc_iface.h b/include/esp32/esp_mfcc_iface.h new file mode 100644 index 0000000..95e287b --- /dev/null +++ b/include/esp32/esp_mfcc_iface.h @@ -0,0 +1,89 @@ +#pragma once +#include +#include "esp_speech_features.h" + +/* +This describes an interface for a MFCC runner, that is, some kind of implementation that can be +fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so +multiple implementations can be used. +*/ + + +typedef struct esp_mfcc_data_t esp_mfcc_data_t; + + +//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please +//refer to its documentation for details. +typedef struct { + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 
0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum + int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) + float log_epsilon; // log epsilon. (e.g. 1e-7) + bool psram_first; // Alloc memory from PSRAM first + bool remove_dc_offset; // Whether to subtract mean of wave before FFT +} esp_mfcc_opts_t; + + +/** + * @brief Un-initialize and free a mfcc runner + * + * Function to free a previously allocated mfcc runner. + * + * @param r Runner object to destroy + */ +typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); + +/** + * @brief Initialize parameters for a mfcc runner. + * + * After creation, a mfcc runner needs to be initialized first; this is usually done + * in the initialization routine of a speech recognition algorithm. This provides + * a pointer to do this for a specific mfcc runner. + * + * @param opt Options for the mfcc process + * @return True if success, false on error. + */ +typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); + +/** + * @brief Run a mfcc iteration on frame by frame + * + * This will take a set of samples and return a ceptrum. Note that this may be pipelined: + * an initial call to this function may return NULL and subsequent calls may return the + * cepstrum of previous calls. + * + * @param r The mfcc runner + * @param samp An array of signed 16-bit samples. The amount of samples should be sampfreq/(winstep_ms/1000). + * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function + * when done with this buffer. Note that some implementations require the buffer to be freed before another call + * to this function is done. 
+ */ +typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); + +/** + * @brief Clean all state of mfcc handle + * + * @param r The mfcc runner + */ +typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r); + +/** + * @brief Operations possible on a mfcc runner + */ +typedef struct { + esp_mfcc_op_destroy_t destroy; + esp_mfcc_op_create_t create; + esp_mfcc_op_run_step_t run_step; + esp_mfcc_op_clean_t clean; +} esp_mfcc_iface_t; diff --git a/include/esp32/esp_mfcc_models.h b/include/esp32/esp_mfcc_models.h new file mode 100644 index 0000000..f8e9119 --- /dev/null +++ b/include/esp32/esp_mfcc_models.h @@ -0,0 +1,40 @@ +#pragma once +#include "esp_mfcc_iface.h" + + +extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle + + +/** + * @brief Return basic opts used in wakenet9 & multinet5 + **/ +esp_mfcc_opts_t *get_mfcc_opts_wn9(); + +/** + * @brief Return basic opts for default kaldifeat + * + opts->psram_first = true; + opts->use_power = true; + opts->use_log_fbank = 2; // log(max(x, log_epsilon)) + opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps + opts->win_type = "povey"; + opts->low_freq = 20; + opts->high_freq = 7600; + opts->samp_freq = 16000; + opts->nch = 1; + opts->nfft = 512; + opts->nfilter = 80; + opts->numcep = 80; + opts->preemph = 0.97; + opts->append_energy = false; + opts->winlen_ms = 25; + opts->winstep_ms = 10; + opts->remove_dc_offset = true; + * + **/ +esp_mfcc_opts_t *get_mfcc_opts_kaldi(); + +/** + * @brief Print mfcc opts + **/ +void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file diff --git a/include/esp32/esp_speech_features.h b/include/esp32/esp_speech_features.h new file mode 100644 index 0000000..3552f4a --- /dev/null +++ b/include/esp32/esp_speech_features.h @@ -0,0 +1,64 @@ +#pragma once +#include "c_speech_features_config.h" +#include "stdlib.h" +#include +#include + +#ifndef M_2PI +#define M_2PI 6.283185307179586476925286766559005 
+#endif + +typedef struct +{ + float *coeff; + int *bank_pos; + int nfilter; +} esp_mel_filter_t; + +float* esp_mfcc_malloc(size_t size, bool from_psram); + +void esp_mfcc_free(void *ptr); + +/** + * @brief Initialize FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param nfft The input samples number + * @return fft-table + **/ +void* esp_fft_init(int nfft); + +/** + * @brief Free FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param fft_table The fft table initialized by esp_fft_init + * @param nfft The input samples number + * @return fft-table + **/ +void esp_fft_deinit(void *fft_table, int nfft); + +/** + * @brief Initial window function + * Currently support hanning, hamming, sine, povey, rectangular, + * wn9(512-hanning to get wakenet9& multinet5 compatible) + **/ +float *esp_win_func_init(char *win_type, float* window_data, int frame_length); + +float* esp_fftr(float* x, int nfft, void *fft_table); + +float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); + +void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); + +float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); + +esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, + bool from_psram); + +void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); + +float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, + float epsilon); diff --git a/include/esp32p4/c_speech_features_config.h b/include/esp32p4/c_speech_features_config.h new file mode 100644 index 0000000..e21e020 --- /dev/null +++ b/include/esp32p4/c_speech_features_config.h @@ -0,0 +1,29 @@ +#pragma once +#include +#include + +/* #undef ENABLE_DOUBLE */ + +#ifdef ENABLE_DOUBLE +# define csf_float double +# define csf_ceil ceil +# define csf_floor floor +# define csf_sin 
sin +# define csf_log log +# define csf_log10 log10 +# define csf_pow pow +# define csf_sqrt sqrt +# define csf_abs fabs +# define csf_float_min DBL_MIN +#else +# define csf_float float +# define csf_ceil ceilf +# define csf_floor floorf +# define csf_sin sinf +# define csf_log logf +# define csf_log10 log10f +# define csf_pow powf +# define csf_sqrt sqrtf +# define csf_abs fabsf +# define csf_float_min FLT_MIN +#endif diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index 694caa2..16906bd 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -1,241 +1,245 @@ #pragma once -#include "stdint.h" -#include "stdbool.h" -#include "stdlib.h" -#include "esp_wn_iface.h" -#include "esp_wn_models.h" -#include "esp_vad.h" #include "esp_aec.h" #include "esp_agc.h" -#include "model_path.h" -#include "esp_vadn_models.h" #include "esp_nsn_models.h" +#include "esp_vad.h" +#include "esp_vadn_models.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" +#include "model_path.h" +#include "stdbool.h" +#include "stdint.h" +#include "stdlib.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//VC: Voice Communication +// AFE: Audio Front-End +// SR: Speech Recognition +// VC: Voice Communication -//Set AFE_SR mode +// Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode - SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode + SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; -//Set AFE mode +// Set AFE mode typedef enum { - AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode - AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high 
performance mode } afe_mode_t; -//Set AFE type +// Set AFE type typedef enum { - AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression } afe_type_t; typedef enum { - AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram - AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance - AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram + AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram + AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance + AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB - AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel - int mic_num; // microphone channel number - uint8_t* mic_ids; // microphone channel indices - int ref_num; // playback reference channel number - uint8_t* ref_ids; // playback reference channel indices - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include microphone channel, playback channel and 
unknown channel + int mic_num; // microphone channel number + uint8_t *mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t *ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" - AFE_NS_MODE_NET = 1, // please use model name of NSNET + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; typedef enum { - AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC - AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated } afe_agc_mode_t; /** * @brief Function to get the debug audio data * - * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long. + * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that + * avoid blocking for too long. * @param data_size The number of bytes of data. 
* @returns */ -typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size); +typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size); typedef enum { - AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task - AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task + AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task + AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task AFE_DEBUG_HOOK_MAX = 2 } afe_debug_hook_type_t; typedef struct { - afe_debug_hook_type_t hook_type; // debug type of hook - afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data + afe_debug_hook_type_t hook_type; // debug type of hook + afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data } afe_debug_hook_t; typedef struct { /********** AEC(Acoustic Echo Cancellation) **********/ - bool aec_init; // Whether to init aec - aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF - int aec_filter_length; // The filter length of aec + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec /********** SE(Speech Enhancement, microphone array processing) **********/ - bool se_init; // Whether to init se + bool se_init; // Whether to init se /********** NS(Noise Suppression) **********/ - bool ns_init; // Whether to init ns - char *ns_model_name; // Model name of ns - afe_ns_mode_t afe_ns_mode; // Model mode of ns - + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + /********** VAD(Voice Activity Detection) **********/ - bool vad_init; // Whether to init vad - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 - char *vad_model_name; // 
The model name of vad, If it is null, WebRTC VAD will be used. - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms - int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false - bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: + // 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - char *wakenet_model_name; // The model name of wakenet 1 - char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; // The mode of wakenet + char *wakenet_model_name; // The model name of wakenet 1 + char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 + det_mode_t wakenet_mode; // The mode of wakenet /********** AGC(Automatic Gain Control) **********/ - bool agc_init; // Whether to init agc - afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
- int agc_compression_gain_db; // Compression gain in dB (default 9) - int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + bool agc_init; // Whether to init agc + afe_agc_mode_t + agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) /********** General AFE(Audio Front End) parameter **********/ - afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. - afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. - int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. - int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. - afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM - float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. + afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. 
+ afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts + // directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone - // otherwise, select channel number by wakenet + bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone + // otherwise, select channel number by wakenet } afe_config_t; /** - * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. - * You can manually fine-tune it after creating the configuration - * - * The input format: + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based + * on the chip target and input format. 
You can manually fine-tune it after creating the configuration + * + * The input format: * M to represent the microphone channel * R to represent the playback reference channel * N to represent an unknown or unused channel - * - * For example, input_format="MMNR" indicates that the input data consists of four channels, + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, * which are the microphone channel, the microphone channel, an unused channel, and the playback channel - * + * * @param input_format The input format * @param models Models from partition, which is configured by Kconfig * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - * + * * @return afe_config_t* The default config of afe */ afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); /** * @brief Check AFE configuration and make sure it is correct. - * - * @warning If there is a configuration conflict, this function will modify some parameters. + * + * @warning If there is a configuration conflict, this function will modify some parameters. * The guiding behind these modifications is to maintain the highest performance of the output audio and results. * And remove the conflict between different algorithms. - * + * * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
- * + * * @param afe_config Input AFE config - * + * * @return afe_config_t* The modified AFE config */ afe_config_t *afe_config_check(afe_config_t *afe_config); /** * @brief Parse input format - * + * * @param input_format The input format, same with afe_config_init() function * @param pcm_config The pcm config - * + * * @return true if the input format is parsed successfully, otherwise false */ -bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); +bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config); /** * @brief Parse I2S input data - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param mic_data The output microphone data * @param ref_data The output playback reference data * @param pcm_config The pcm config - * + * */ -void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); +void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config); /** * @brief Parse input data, from interleaved arrangement to contiguous arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Format input data, from contiguous arrangement to interleaved arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_format_data(int16_t *data, int 
frame_size, int channel_num, int16_t* out_data); +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param data The input audio data * @param frame_size The frame size of input, it is also the size of single channel data * @param factor The gain factor - * + * * @return int16_t* The output audio data */ -int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); +int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param in_data The input audio data * @param in_frame_size Input data frame size of input * @param channel_num The channel number of input data, which is same as output data @@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); * @param out_frame_size Onput data frame size of input * */ -void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size); /** * @brief Copy the afe config - * + * * @param dst_config The destination afe config * @param src_config The source afe config - * + * * @return The destination afe config */ -afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); +afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); /** * @brief Print the afe config - * + * * @param afe_config The afe config */ void afe_config_print(const afe_config_t *afe_config); /** * @brief Allocate afe config - * + * * @return The afe config pointer */ afe_config_t *afe_config_alloc(); /** * @brief Free afe config - * + * * @param afe_config The afe config pointer */ void afe_config_free(afe_config_t 
*afe_config); diff --git a/include/esp32p4/esp_afe_sr_iface.h b/include/esp32p4/esp_afe_sr_iface.h index f434c3e..580eed9 100644 --- a/include/esp32p4/esp_afe_sr_iface.h +++ b/include/esp32p4/esp_afe_sr_iface.h @@ -1,62 +1,61 @@ #pragma once +#include "esp_afe_config.h" +#include "stdbool.h" #include "stdint.h" #include "stdlib.h" -#include "stdbool.h" -#include "esp_afe_config.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition +// AFE: Audio Front-End +// SR: Speech Recognition +// afe_sr/AFE_SR: the audio front-end for speech recognition -//Opaque AFE_SR data container +// Opaque AFE_SR data container typedef struct esp_afe_sr_data_t esp_afe_sr_data_t; - - /** * @brief The state of vad */ -typedef enum -{ - AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence - AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech +typedef enum { + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** * @brief The result of fetch function */ -typedef struct afe_fetch_result_t -{ - int16_t *data; // the target channel data of audio. - int data_size; // the size of data. The unit is byte. - int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. - int vad_cache_size; // the size of vad_cache. The unit is byte. - float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). - // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. - wakenet_state_t wakeup_state; // the value is wakenet_state_t - int wake_word_index; // if the wake word is detected. 
It will store the wake word index which start from 1. - int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - vad_state_t vad_state; // the value is afe_vad_state_t - int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. The unit is the number of samples. - int ret_value; // the return state of fetch function - int16_t *raw_data; // the multi-channel output data of audio. - int raw_data_channels; // the channel number of raw data - void* reserved; // reserved for future use +typedef struct afe_fetch_result_t { + int16_t *data; // the target channel data of audio. + int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the + // audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. + float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. + // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of + // wakenet(about 1.5s), otherwise is the frame length. + wakenet_state_t wakeup_state; // the value is wakenet_state_t + int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. + int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index + // start from 1. + vad_state_t vad_state; // the value is afe_vad_state_t + int trigger_channel_id; // the channel index of output + int wake_word_length; // the length of wake word. The unit is the number of samples. + int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data + void *reserved; // reserved for future use } afe_fetch_result_t; /** * @brief Function to initialze a AFE_SR instance - * + * * @param afe_config The config of AFE_SR * @returns Handle to the AFE_SR data */ -typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); +typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); /** * @brief Get the amount of each channel samples per frame that need to be passed to the function @@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** * @brief Get the channel number - * + * * @param afe The AFE_SR object to query * @return The amount of total channels */ @@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe); * The last channel is reference signal if it has reference data. * * @param afe The AFE_SR object to query - * - * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the + * + * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the * `get_feed_chunksize`. * @return The size of input */ -typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in); +typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR @@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` @@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af * * @param afe The AFE_SR object to query * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); /** * @brief reset ringbuf of AFE. @@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); /** - * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient + * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient * when wakenet has been initialized. It's only support wakenet 1 now. 
* * @param afe The AFE_SR object to query * @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); +typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** * @brief Enable VAD algorithm. @@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); */ typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe); - /** * This structure contains the functions used to do operations on a AFE_SR. */ @@ -191,11 +191,11 @@ typedef struct { esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; - esp_afe_sr_iface_op_set_wakenet_t set_wakenet; + esp_afe_sr_iface_op_set_wakenet_t set_wakenet; esp_afe_sr_iface_op_disable_func_t disable_wakenet; esp_afe_sr_iface_op_enable_func_t enable_wakenet; esp_afe_sr_iface_op_disable_func_t disable_aec; @@ -212,16 +212,14 @@ typedef struct { esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; - // struct is used to store the AFE handle and data for the AFE task -typedef struct -{ +typedef struct { esp_afe_sr_data_t *afe_data; esp_afe_sr_iface_t *afe_handle; - TaskHandle_t feed_task; - TaskHandle_t fetch_task; -}afe_task_into_t; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +} afe_task_into_t; #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32p4/esp_afe_sr_models.h 
b/include/esp32p4/esp_afe_sr_models.h index 05a08d3..5b37b42 100644 --- a/include/esp32p4/esp_afe_sr_models.h +++ b/include/esp32p4/esp_afe_sr_models.h @@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32p4/esp_mfcc_iface.h b/include/esp32p4/esp_mfcc_iface.h new file mode 100644 index 0000000..95e287b --- /dev/null +++ b/include/esp32p4/esp_mfcc_iface.h @@ -0,0 +1,89 @@ +#pragma once +#include +#include "esp_speech_features.h" + +/* +This describes an interface for a MFCC runner, that is, some kind of implementation that can be +fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so +multiple implementations can be used. +*/ + + +typedef struct esp_mfcc_data_t esp_mfcc_data_t; + + +//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please +//refer to its documentation for details. +typedef struct { + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 
0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum + int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) + float log_epsilon; // log epsilon. (e.g. 1e-7) + bool psram_first; // Alloc memory from PSRAM first + bool remove_dc_offset; // Whether to subtract mean of wave before FFT +} esp_mfcc_opts_t; + + +/** + * @brief Un-initialize and free a mfcc runner + * + * Function to free a previously allocated mfcc runner. + * + * @param r Runner object to destroy + */ +typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); + +/** + * @brief Initialize parameters for a mfcc runner. + * + * After creation, a mfcc runner needs to be initialized first; this is usually done + * in the initialization routine of a speech recognition algorithm. This provides + * a pointer to do this for a specific mfcc runner. + * + * @param opt Options for the mfcc process + * @return True if success, false on error. + */ +typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); + +/** + * @brief Run a mfcc iteration on frame by frame + * + * This will take a set of samples and return a ceptrum. Note that this may be pipelined: + * an initial call to this function may return NULL and subsequent calls may return the + * cepstrum of previous calls. + * + * @param r The mfcc runner + * @param samp An array of signed 16-bit samples. The amount of samples should be sampfreq/(winstep_ms/1000). + * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function + * when done with this buffer. Note that some implementations require the buffer to be freed before another call + * to this function is done. 
+ */ +typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); + +/** + * @brief Clean all state of mfcc handle + * + * @param r The mfcc runner + */ +typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r); + +/** + * @brief Operations possible on a mfcc runner + */ +typedef struct { + esp_mfcc_op_destroy_t destroy; + esp_mfcc_op_create_t create; + esp_mfcc_op_run_step_t run_step; + esp_mfcc_op_clean_t clean; +} esp_mfcc_iface_t; diff --git a/include/esp32p4/esp_mfcc_models.h b/include/esp32p4/esp_mfcc_models.h new file mode 100644 index 0000000..f8e9119 --- /dev/null +++ b/include/esp32p4/esp_mfcc_models.h @@ -0,0 +1,40 @@ +#pragma once +#include "esp_mfcc_iface.h" + + +extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle + + +/** + * @brief Return basic opts used in wakenet9 & multinet5 + **/ +esp_mfcc_opts_t *get_mfcc_opts_wn9(); + +/** + * @brief Return basic opts for default kaldifeat + * + opts->psram_first = true; + opts->use_power = true; + opts->use_log_fbank = 2; // log(max(x, log_epsilon)) + opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps + opts->win_type = "povey"; + opts->low_freq = 20; + opts->high_freq = 7600; + opts->samp_freq = 16000; + opts->nch = 1; + opts->nfft = 512; + opts->nfilter = 80; + opts->numcep = 80; + opts->preemph = 0.97; + opts->append_energy = false; + opts->winlen_ms = 25; + opts->winstep_ms = 10; + opts->remove_dc_offset = true; + * + **/ +esp_mfcc_opts_t *get_mfcc_opts_kaldi(); + +/** + * @brief Print mfcc opts + **/ +void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file diff --git a/include/esp32p4/esp_speech_features.h b/include/esp32p4/esp_speech_features.h new file mode 100644 index 0000000..3552f4a --- /dev/null +++ b/include/esp32p4/esp_speech_features.h @@ -0,0 +1,64 @@ +#pragma once +#include "c_speech_features_config.h" +#include "stdlib.h" +#include +#include + +#ifndef M_2PI +#define M_2PI 
6.283185307179586476925286766559005 +#endif + +typedef struct +{ + float *coeff; + int *bank_pos; + int nfilter; +} esp_mel_filter_t; + +float* esp_mfcc_malloc(size_t size, bool from_psram); + +void esp_mfcc_free(void *ptr); + +/** + * @brief Initialize FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param nfft The input samples number + * @return fft-table + **/ +void* esp_fft_init(int nfft); + +/** + * @brief Free FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param fft_table The fft table initialized by esp_fft_init + * @param nfft The input samples number + * @return fft-table + **/ +void esp_fft_deinit(void *fft_table, int nfft); + +/** + * @brief Initial window function + * Currently support hanning, hamming, sine, povey, rectangular, + * wn9(512-hanning to get wakenet9& multinet5 compatible) + **/ +float *esp_win_func_init(char *win_type, float* window_data, int frame_length); + +float* esp_fftr(float* x, int nfft, void *fft_table); + +float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); + +void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); + +float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); + +esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, + bool from_psram); + +void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); + +float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, + float epsilon); diff --git a/include/esp32s3/c_speech_features_config.h b/include/esp32s3/c_speech_features_config.h new file mode 100644 index 0000000..e21e020 --- /dev/null +++ b/include/esp32s3/c_speech_features_config.h @@ -0,0 +1,29 @@ +#pragma once +#include +#include + +/* #undef ENABLE_DOUBLE */ + +#ifdef ENABLE_DOUBLE +# define csf_float double +# define csf_ceil ceil +# define 
csf_floor floor +# define csf_sin sin +# define csf_log log +# define csf_log10 log10 +# define csf_pow pow +# define csf_sqrt sqrt +# define csf_abs fabs +# define csf_float_min DBL_MIN +#else +# define csf_float float +# define csf_ceil ceilf +# define csf_floor floorf +# define csf_sin sinf +# define csf_log logf +# define csf_log10 log10f +# define csf_pow powf +# define csf_sqrt sqrtf +# define csf_abs fabsf +# define csf_float_min FLT_MIN +#endif diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 694caa2..16906bd 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -1,241 +1,245 @@ #pragma once -#include "stdint.h" -#include "stdbool.h" -#include "stdlib.h" -#include "esp_wn_iface.h" -#include "esp_wn_models.h" -#include "esp_vad.h" #include "esp_aec.h" #include "esp_agc.h" -#include "model_path.h" -#include "esp_vadn_models.h" #include "esp_nsn_models.h" +#include "esp_vad.h" +#include "esp_vadn_models.h" +#include "esp_wn_iface.h" +#include "esp_wn_models.h" +#include "model_path.h" +#include "stdbool.h" +#include "stdint.h" +#include "stdlib.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//VC: Voice Communication +// AFE: Audio Front-End +// SR: Speech Recognition +// VC: Voice Communication -//Set AFE_SR mode +// Set AFE_SR mode typedef enum { - SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode - SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode + SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode } afe_sr_mode_t; -//Set AFE mode +// Set AFE mode typedef enum { - AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode - AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + 
 AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode } afe_mode_t; -//Set AFE type +// Set AFE type typedef enum { - AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression - AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression } afe_type_t; typedef enum { - AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram - AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance - AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram + AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram + AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance + AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram } afe_memory_alloc_mode_t; typedef enum { - AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB - AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB - AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB - AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain } afe_mn_peak_agc_mode_t; typedef struct { - int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel - int mic_num; // microphone channel number - uint8_t* mic_ids; // microphone channel indices - int ref_num; // playback reference channel number - uint8_t* ref_ids; // playback reference channel indices - int sample_rate; // sample rate of audio + int total_ch_num; // total channel num, include 
microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t *mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t *ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio } afe_pcm_config_t; typedef enum { - AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" - AFE_NS_MODE_NET = 1, // please use model name of NSNET + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET } afe_ns_mode_t; typedef enum { - AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC - AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated } afe_agc_mode_t; /** * @brief Function to get the debug audio data * - * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that avoid blocking for too long. + * @param data The debug audio data which don't be modify. It should be copied away as soon as possible that + * avoid blocking for too long. * @param data_size The number of bytes of data. 
* @returns */ -typedef void (*afe_debug_hook_callback_t)(const int16_t* data, int data_size); +typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size); typedef enum { - AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task - AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task + AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task + AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task AFE_DEBUG_HOOK_MAX = 2 } afe_debug_hook_type_t; typedef struct { - afe_debug_hook_type_t hook_type; // debug type of hook - afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data + afe_debug_hook_type_t hook_type; // debug type of hook + afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data } afe_debug_hook_t; typedef struct { /********** AEC(Acoustic Echo Cancellation) **********/ - bool aec_init; // Whether to init aec - aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF - int aec_filter_length; // The filter length of aec + bool aec_init; // Whether to init aec + aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF + int aec_filter_length; // The filter length of aec /********** SE(Speech Enhancement, microphone array processing) **********/ - bool se_init; // Whether to init se + bool se_init; // Whether to init se /********** NS(Noise Suppression) **********/ - bool ns_init; // Whether to init ns - char *ns_model_name; // Model name of ns - afe_ns_mode_t afe_ns_mode; // Model mode of ns - + bool ns_init; // Whether to init ns + char *ns_model_name; // Model name of ns + afe_ns_mode_t afe_ns_mode; // Model mode of ns + /********** VAD(Voice Activity Detection) **********/ - bool vad_init; // Whether to init vad - vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 - char *vad_model_name; // 
The model name of vad, If it is null, WebRTC VAD will be used. - int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms - int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms - bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false - bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false + bool vad_init; // Whether to init vad + vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4 + char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used. + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms + int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: + // 1000 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false + bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false /********** WakeNet(Wake Word Engine) **********/ bool wakenet_init; - char *wakenet_model_name; // The model name of wakenet 1 - char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 - det_mode_t wakenet_mode; // The mode of wakenet + char *wakenet_model_name; // The model name of wakenet 1 + char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2 + det_mode_t wakenet_mode; // The mode of wakenet /********** AGC(Automatic Gain Control) **********/ - bool agc_init; // Whether to init agc - afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. 
- int agc_compression_gain_db; // Compression gain in dB (default 9) - int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + bool agc_init; // Whether to init agc + afe_agc_mode_t + agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. + int agc_compression_gain_db; // Compression gain in dB (default 9) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) /********** General AFE(Audio Front End) parameter **********/ - afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. - afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. - int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. - int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. - afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM - float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude. + afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. + afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function. + int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function. + int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer. 
+ afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM + float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts + // directly on the output amplitude: out_linear_gain * amplitude. bool debug_init; - bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone - // otherwise, select channel number by wakenet + bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone + // otherwise, select channel number by wakenet } afe_config_t; /** - * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format. - * You can manually fine-tune it after creating the configuration - * - * The input format: + * @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based + * on the chip target and input format. 
You can manually fine-tune it after creating the configuration + * + * The input format: * M to represent the microphone channel * R to represent the playback reference channel * N to represent an unknown or unused channel - * - * For example, input_format="MMNR" indicates that the input data consists of four channels, + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, * which are the microphone channel, the microphone channel, an unused channel, and the playback channel - * + * * @param input_format The input format * @param models Models from partition, which is configured by Kconfig * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF - * + * * @return afe_config_t* The default config of afe */ afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode); /** * @brief Check AFE configuration and make sure it is correct. - * - * @warning If there is a configuration conflict, this function will modify some parameters. + * + * @warning If there is a configuration conflict, this function will modify some parameters. * The guiding behind these modifications is to maintain the highest performance of the output audio and results. * And remove the conflict between different algorithms. - * + * * For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm. * If SE(BSS) algorithm is deactivated, will only use the first microphone channel. 
- * + * * @param afe_config Input AFE config - * + * * @return afe_config_t* The modified AFE config */ afe_config_t *afe_config_check(afe_config_t *afe_config); /** * @brief Parse input format - * + * * @param input_format The input format, same with afe_config_init() function * @param pcm_config The pcm config - * + * * @return true if the input format is parsed successfully, otherwise false */ -bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config); +bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config); /** * @brief Parse I2S input data - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param mic_data The output microphone data * @param ref_data The output playback reference data * @param pcm_config The pcm config - * + * */ -void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config); +void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config); /** * @brief Parse input data, from interleaved arrangement to contiguous arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data); +void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Format input data, from contiguous arrangement to interleaved arrangement - * + * * @param data The input multi channel data * @param frame_size The frame size of input, it is also the size of single channel data * @param channel_num The channel number of data * @param out_data The output data - * + * */ -void afe_format_data(int16_t *data, int 
frame_size, int channel_num, int16_t* out_data); +void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param data The input audio data * @param frame_size The frame size of input, it is also the size of single channel data * @param factor The gain factor - * + * * @return int16_t* The output audio data */ -int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); +int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor); /** * @brief Adjust the gain of input data - * + * * @warning the input data will be modified inplace. - * + * * @param in_data The input audio data * @param in_frame_size Input data frame size of input * @param channel_num The channel number of input data, which is same as output data @@ -243,35 +247,35 @@ int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor); * @param out_frame_size Onput data frame size of input * */ -void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size); +void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size); /** * @brief Copy the afe config - * + * * @param dst_config The destination afe config * @param src_config The source afe config - * + * * @return The destination afe config */ -afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); +afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config); /** * @brief Print the afe config - * + * * @param afe_config The afe config */ void afe_config_print(const afe_config_t *afe_config); /** * @brief Allocate afe config - * + * * @return The afe config pointer */ afe_config_t *afe_config_alloc(); /** * @brief Free afe config - * + * * @param afe_config The afe config pointer */ void afe_config_free(afe_config_t 
*afe_config); diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index f434c3e..580eed9 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h @@ -1,62 +1,61 @@ #pragma once +#include "esp_afe_config.h" +#include "stdbool.h" #include "stdint.h" #include "stdlib.h" -#include "stdbool.h" -#include "esp_afe_config.h" #include "freertos/FreeRTOS.h" #include "freertos/task.h" #ifdef __cplusplus extern "C" { #endif -//AFE: Audio Front-End -//SR: Speech Recognition -//afe_sr/AFE_SR: the audio front-end for speech recognition +// AFE: Audio Front-End +// SR: Speech Recognition +// afe_sr/AFE_SR: the audio front-end for speech recognition -//Opaque AFE_SR data container +// Opaque AFE_SR data container typedef struct esp_afe_sr_data_t esp_afe_sr_data_t; - - /** * @brief The state of vad */ -typedef enum -{ - AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence - AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech +typedef enum { + AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence + AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech } afe_vad_state_t; /** * @brief The result of fetch function */ -typedef struct afe_fetch_result_t -{ - int16_t *data; // the target channel data of audio. - int data_size; // the size of data. The unit is byte. - int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. - int vad_cache_size; // the size of vad_cache. The unit is byte. - float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). - // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. - wakenet_state_t wakeup_state; // the value is wakenet_state_t - int wake_word_index; // if the wake word is detected. 
It will store the wake word index which start from 1. - int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. - vad_state_t vad_state; // the value is afe_vad_state_t - int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. The unit is the number of samples. - int ret_value; // the return state of fetch function - int16_t *raw_data; // the multi-channel output data of audio. - int raw_data_channels; // the channel number of raw data - void* reserved; // reserved for future use +typedef struct afe_fetch_result_t { + int16_t *data; // the target channel data of audio. + int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the + // audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. + float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. + // (note: invalid in vc). if enable wakenet, the window length is the receptive fields of + // wakenet(about 1.5s), otherwise is the frame length. + wakenet_state_t wakeup_state; // the value is wakenet_state_t + int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1. + int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index + // start from 1. + vad_state_t vad_state; // the value is afe_vad_state_t + int trigger_channel_id; // the channel index of output + int wake_word_length; // the length of wake word. The unit is the number of samples. + int ret_value; // the return state of fetch function + int16_t *raw_data; // the multi-channel output data of audio. 
+ int raw_data_channels; // the channel number of raw data + void *reserved; // reserved for future use } afe_fetch_result_t; /** * @brief Function to initialze a AFE_SR instance - * + * * @param afe_config The config of AFE_SR * @returns Handle to the AFE_SR data */ -typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); +typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config); /** * @brief Get the amount of each channel samples per frame that need to be passed to the function @@ -71,7 +70,7 @@ typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe); /** * @brief Get the channel number - * + * * @param afe The AFE_SR object to query * @return The amount of total channels */ @@ -92,12 +91,12 @@ typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe); * The last channel is reference signal if it has reference data. * * @param afe The AFE_SR object to query - * - * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the + * + * @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the * `get_feed_chunksize`. * @return The size of input */ -typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in); +typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR @@ -106,9 +105,10 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* * Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`. * * @param afe The AFE_SR object to query - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) 
+ * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe); /** * @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch` @@ -117,9 +117,10 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *af * * @param afe The AFE_SR object to query * @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result. - * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.) + * @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output + * audio can be queried by the `get_fetch_chunksize`.) */ -typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); +typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait); /** * @brief reset ringbuf of AFE. @@ -130,14 +131,14 @@ typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); /** - * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient + * @brief Initial wakenet and wake words coefficient, or reset wakenet and wake words coefficient * when wakenet has been initialized. It's only support wakenet 1 now. 
* * @param afe The AFE_SR object to query * @param wakenet_word The wakenet word, should be DEFAULT_WAKE_WORD or EXTRA_WAKE_WORD * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name); +typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** * @brief Enable VAD algorithm. @@ -179,7 +180,6 @@ typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe); */ typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe); - /** * This structure contains the functions used to do operations on a AFE_SR. */ @@ -191,11 +191,11 @@ typedef struct { esp_afe_sr_iface_op_reset_buffer_t reset_buffer; esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize; esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize; - esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num + esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num; esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num; esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate; - esp_afe_sr_iface_op_set_wakenet_t set_wakenet; + esp_afe_sr_iface_op_set_wakenet_t set_wakenet; esp_afe_sr_iface_op_disable_func_t disable_wakenet; esp_afe_sr_iface_op_enable_func_t enable_wakenet; esp_afe_sr_iface_op_disable_func_t disable_aec; @@ -212,16 +212,14 @@ typedef struct { esp_afe_sr_iface_op_destroy_t destroy; } esp_afe_sr_iface_t; - // struct is used to store the AFE handle and data for the AFE task -typedef struct -{ +typedef struct { esp_afe_sr_data_t *afe_data; esp_afe_sr_iface_t *afe_handle; - TaskHandle_t feed_task; - TaskHandle_t fetch_task; -}afe_task_into_t; + TaskHandle_t feed_task; + TaskHandle_t fetch_task; +} afe_task_into_t; #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32s3/esp_afe_sr_models.h 
b/include/esp32s3/esp_afe_sr_models.h index 05a08d3..5b37b42 100644 --- a/include/esp32s3/esp_afe_sr_models.h +++ b/include/esp32s3/esp_afe_sr_models.h @@ -10,4 +10,4 @@ esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config); #ifdef __cplusplus } -#endif \ No newline at end of file +#endif diff --git a/include/esp32s3/esp_mfcc_iface.h b/include/esp32s3/esp_mfcc_iface.h new file mode 100644 index 0000000..95e287b --- /dev/null +++ b/include/esp32s3/esp_mfcc_iface.h @@ -0,0 +1,89 @@ +#pragma once +#include +#include "esp_speech_features.h" + +/* +This describes an interface for a MFCC runner, that is, some kind of implementation that can be +fed sample chunks and returns the MFCC cepstrum of those samples. This is an abstracted interface so +multiple implementations can be used. +*/ + + +typedef struct esp_mfcc_data_t esp_mfcc_data_t; + + +//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please +//refer to its documentation for details. +typedef struct { + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 
0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum + int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) + float log_epsilon; // log epsilon. (e.g. 1e-7) + bool psram_first; // Alloc memory from PSRAM first + bool remove_dc_offset; // Whether to subtract mean of wave before FFT +} esp_mfcc_opts_t; + + +/** + * @brief Un-initialize and free a mfcc runner + * + * Function to free a previously allocated mfcc runner. + * + * @param r Runner object to destroy + */ +typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); + +/** + * @brief Initialize parameters for a mfcc runner. + * + * After creation, a mfcc runner needs to be initialized first; this is usually done + * in the initialization routine of a speech recognition algorithm. This provides + * a pointer to do this for a specific mfcc runner. + * + * @param opt Options for the mfcc process + * @return True if success, false on error. + */ +typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); + +/** + * @brief Run a mfcc iteration on frame by frame + * + * This will take a set of samples and return a ceptrum. Note that this may be pipelined: + * an initial call to this function may return NULL and subsequent calls may return the + * cepstrum of previous calls. + * + * @param r The mfcc runner + * @param samp An array of signed 16-bit samples. The amount of samples should be sampfreq/(winstep_ms/1000). + * @return A set of cepstral values, or NULL if no such values are available yet. Free using the free_cepbuf function + * when done with this buffer. Note that some implementations require the buffer to be freed before another call + * to this function is done. 
+ */ +typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); + +/** + * @brief Clean all state of mfcc handle + * + * @param r The mfcc runner + */ +typedef void (*esp_mfcc_op_clean_t)(esp_mfcc_data_t *r); + +/** + * @brief Operations possible on a mfcc runner + */ +typedef struct { + esp_mfcc_op_destroy_t destroy; + esp_mfcc_op_create_t create; + esp_mfcc_op_run_step_t run_step; + esp_mfcc_op_clean_t clean; +} esp_mfcc_iface_t; diff --git a/include/esp32s3/esp_mfcc_models.h b/include/esp32s3/esp_mfcc_models.h new file mode 100644 index 0000000..f8e9119 --- /dev/null +++ b/include/esp32s3/esp_mfcc_models.h @@ -0,0 +1,40 @@ +#pragma once +#include "esp_mfcc_iface.h" + + +extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle + + +/** + * @brief Return basic opts used in wakenet9 & multinet5 + **/ +esp_mfcc_opts_t *get_mfcc_opts_wn9(); + +/** + * @brief Return basic opts for default kaldifeat + * + opts->psram_first = true; + opts->use_power = true; + opts->use_log_fbank = 2; // log(max(x, log_epsilon)) + opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps + opts->win_type = "povey"; + opts->low_freq = 20; + opts->high_freq = 7600; + opts->samp_freq = 16000; + opts->nch = 1; + opts->nfft = 512; + opts->nfilter = 80; + opts->numcep = 80; + opts->preemph = 0.97; + opts->append_energy = false; + opts->winlen_ms = 25; + opts->winstep_ms = 10; + opts->remove_dc_offset = true; + * + **/ +esp_mfcc_opts_t *get_mfcc_opts_kaldi(); + +/** + * @brief Print mfcc opts + **/ +void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file diff --git a/include/esp32s3/esp_speech_features.h b/include/esp32s3/esp_speech_features.h new file mode 100644 index 0000000..3552f4a --- /dev/null +++ b/include/esp32s3/esp_speech_features.h @@ -0,0 +1,64 @@ +#pragma once +#include "c_speech_features_config.h" +#include "stdlib.h" +#include +#include + +#ifndef M_2PI +#define M_2PI 
6.283185307179586476925286766559005 +#endif + +typedef struct +{ + float *coeff; + int *bank_pos; + int nfilter; +} esp_mel_filter_t; + +float* esp_mfcc_malloc(size_t size, bool from_psram); + +void esp_mfcc_free(void *ptr); + +/** + * @brief Initialize FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param nfft The input samples number + * @return fft-table + **/ +void* esp_fft_init(int nfft); + +/** + * @brief Free FFT table + * @warning For ESP-PLATFORM, use esp-dsp fft + * For Other platform, use kiss fft + * + * @param fft_table The fft table initialized by esp_fft_init + * @param nfft The input samples number + * @return fft-table + **/ +void esp_fft_deinit(void *fft_table, int nfft); + +/** + * @brief Initial window function + * Currently support hanning, hamming, sine, povey, rectangular, + * wn9(512-hanning to get wakenet9& multinet5 compatible) + **/ +float *esp_win_func_init(char *win_type, float* window_data, int frame_length); + +float* esp_fftr(float* x, int nfft, void *fft_table); + +float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); + +void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); + +float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); + +esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, + bool from_psram); + +void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); + +float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, + float epsilon); diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index 840eeae..f3c0381 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index 13e39be..3c2b9d7 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and 
b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index 8f3cbf1..bfaa367 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index a246897..d5c18d9 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index a54c0d0..5b90657 100644 Binary files a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32p4/libc_speech_features.a b/lib/esp32p4/libc_speech_features.a index c99d6c3..dee49de 100644 Binary files a/lib/esp32p4/libc_speech_features.a and b/lib/esp32p4/libc_speech_features.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index 27faac9..705fa4a 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index 2601a45..4487112 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 408291f..1345a5c 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index 70114e5..58bc2b0 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index 67fe548..20890d1 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 46ee467..3c4f69c 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git 
a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 8452944..cad62f2 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 60fb949..4df5207 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index c26d920..891fac2 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 5621d1b..81e0015 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index 685185f..40949a8 100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 88f8846..b74c5a8 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/test_apps/esp-sr/main/CMakeLists.txt b/test_apps/esp-sr/main/CMakeLists.txt index d52b236..08ad7bb 100644 --- a/test_apps/esp-sr/main/CMakeLists.txt +++ b/test_apps/esp-sr/main/CMakeLists.txt @@ -4,6 +4,7 @@ set(srcs "test_wakenet.cpp" "test_multinet.cpp" "test_afe.cpp" + "test_mfcc.cpp" ) idf_component_register(SRCS ${srcs} diff --git a/test_apps/esp-sr/main/test_mfcc.cpp b/test_apps/esp-sr/main/test_mfcc.cpp new file mode 100644 index 0000000..c5dfcf2 --- /dev/null +++ b/test_apps/esp-sr/main/test_mfcc.cpp @@ -0,0 +1,63 @@ +#include +#include +#include "string.h" +#include +#include "unity.h" +#include "esp_log.h" +#include "esp_heap_caps.h" +#include "esp_mfcc_iface.h" +#include "esp_mfcc_models.h" +#include "alexa.h" + +esp_mfcc_opts_t *get_fbank_opts_kaldi(int dim) +{ + esp_mfcc_opts_t *opts = (esp_mfcc_opts_t*)malloc(sizeof(esp_mfcc_opts_t)); + 
opts->psram_first = true; + opts->use_power = true; + opts->use_log_fbank = 0; // log(max(x, log_epsilon)) + opts->log_epsilon = 1.1920928955078125e-07f; // torch.finfo(torch.float32).eps + opts->win_type = const_cast("hanning"); // remove [-Wwrite-strings] warning + opts->low_freq = 20; + opts->high_freq = 7600; + opts->samp_freq = 16000; + opts->nch = 1; + opts->nfft = 512; + opts->nfilter = 80; + opts->numcep = dim; + opts->preemph = 0.97; + opts->append_energy = false; + opts->winlen_ms = 25; + opts->winstep_ms = 10; + opts->remove_dc_offset = true; + + return opts; +} + + +TEST_CASE("test speech features", "[fbank]") +{ + int16_t *buffer = (int16_t *) malloc(512 * sizeof(int16_t)); + const esp_mfcc_iface_t *fbank_handle = &esp_fbank_f32; + float* fbank_out = NULL; + + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + // MFCC init + int out_dim = 80; + esp_mfcc_data_t *fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim)); + memcpy(buffer, alexa, 512 * sizeof(int16_t)); + fbank_out = fbank_handle->run_step(fbank_model, buffer, 0); + fbank_handle->destroy(fbank_model); + int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + for (int i = 0; i < out_dim; i++) { + printf("%f ", fbank_out[i]); + } + + fbank_model = fbank_handle->create(get_fbank_opts_kaldi(out_dim)); + memcpy(buffer, alexa, 512 * sizeof(int16_t)); + fbank_out = fbank_handle->run_step(fbank_model, buffer, 0); + fbank_handle->destroy(fbank_model); + int second_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + + TEST_ASSERT_EQUAL(true, start_size - first_end_size < 100); // there are some memory leak in system + TEST_ASSERT_EQUAL(true, first_end_size == second_end_size); +}