diff --git a/CMakeLists.txt b/CMakeLists.txt index 530c6b4..830c8a2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -111,7 +111,9 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5") component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a") + add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a") target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor) + target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end) elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6")) #Only support TTS on esp32s2, esp32c3 and esp32c6 diff --git a/include/esp32/esp_afe_aec.h b/include/esp32/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only support 1 microphone channel and 1 playback channe. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. 
+ * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The created instance of AEC + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and the frame from the mic. + * + * @param inst The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note that indata will be modified by this call. + * @param outdata Returns near-end signal with echo removed. + + * @return The size of outdata in bytes. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param inst The instance of AEC. 
+ * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AFE_AEC_H_ diff --git a/include/esp32/esp_afe_config.h b/include/esp32/esp_afe_config.h index 16906bd..00ac15b 100644 --- a/include/esp32/esp_afe_config.h +++ b/include/esp32/esp_afe_config.h @@ -110,6 +110,8 @@ typedef struct { int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: // 1000 ms + int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms + // If the vad cache cannot cover all of the speech, please increase this value. bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false diff --git a/include/esp32/esp_afe_sr_iface.h b/include/esp32/esp_afe_sr_iface.h index 580eed9..ffc6ce2 100644 --- a/include/esp32/esp_afe_sr_iface.h +++ b/include/esp32/esp_afe_sr_iface.h @@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** - * @brief Enable VAD algorithm. + * @brief Reset one function/module/algorithm. * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled + * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe); /** * @brief Disable one function/module/algorithm. 
@@ -204,6 +204,7 @@ typedef struct { esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_reset_op_t reset_vad; esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_disable_func_t disable_agc; diff --git a/include/esp32/esp_mfcc_iface.h b/include/esp32/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32/esp_mfcc_iface.h +++ b/include/esp32/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms. (25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 
0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. 
*/ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. */ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32/esp_mfcc_models.h b/include/esp32/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32/esp_mfcc_models.h +++ b/include/esp32/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32/esp_speech_features.h b/include/esp32/esp_speech_features.h index 3552f4a..c1659f9 100644 --- 
a/include/esp32/esp_speech_features.h +++ b/include/esp32/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); 
+esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32/esp_vad.h b/include/esp32/esp_vad.h index f3c5dd4..7e0b144 100644 --- a/include/esp32/esp_vad.h +++ b/include/esp32/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 +#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. 
*/ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. 
@@ -138,6 +137,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, */ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); +/** + * @brief Reset trigger state as Silence + * + * @param handle The instance of VAD. + */ +void vad_reset_trigger(vad_handle_t handle); + /** * @brief Free the VAD instance * @@ -149,20 +155,21 @@ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. 
+ * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/include/esp32c5/esp_afe_aec.h b/include/esp32c5/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32c5/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only support 1 microphone channel and 1 playback channe. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. + * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_config_t* The default config of afe + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic. 
+ * + * @param inst The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note that indata will be modified by this call. + * @param outdata Returns near-end signal with echo removed. + + * @return The size of outdata in bytes. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param inst The instance of AEC. + * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AFE_AEC_H_ diff --git a/include/esp32c5/esp_afe_config.h b/include/esp32c5/esp_afe_config.h new file mode 100644 index 0000000..f9de6fe --- /dev/null +++ b/include/esp32c5/esp_afe_config.h @@ -0,0 +1,69 @@ +#pragma once +#include "esp_aec.h" +#include "stdbool.h" +#include "stdint.h" +#include "stdlib.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// AFE: Audio Front-End +// SR: Speech Recognition +// VC: Voice Communication + +// Set AFE_SR mode +typedef enum { + SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode + SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode +} afe_sr_mode_t; + +// Set AFE mode +typedef enum { + AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode + AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode +} afe_mode_t; + +// Set AFE type +typedef enum { + AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression + AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression +} afe_type_t; + +typedef enum { + AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram + AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance + 
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram +} afe_memory_alloc_mode_t; + +typedef enum { + AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB + AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB + AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB + AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain +} afe_mn_peak_agc_mode_t; + +typedef struct { + int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel + int mic_num; // microphone channel number + uint8_t *mic_ids; // microphone channel indices + int ref_num; // playback reference channel number + uint8_t *ref_ids; // playback reference channel indices + int sample_rate; // sample rate of audio +} afe_pcm_config_t; + +typedef enum { + AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC" + AFE_NS_MODE_NET = 1, // please use model name of NSNET +} afe_ns_mode_t; + +typedef enum { + AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC + AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated +} afe_agc_mode_t; + + +#ifdef __cplusplus +} +#endif + diff --git a/include/esp32p4/esp_afe_aec.h b/include/esp32p4/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32p4/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. 
+ * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The created instance of AEC + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and the frame from the mic. + * + * @param inst The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note that indata will be modified by this call. + * @param outdata Returns near-end signal with echo removed. + + * @return The size of outdata in bytes. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param inst The instance of AEC. 
+ * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AEC_H_ diff --git a/include/esp32p4/esp_mfcc_iface.h b/include/esp32p4/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32p4/esp_mfcc_iface.h +++ b/include/esp32p4/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms. (25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. 
(10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. */ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. 
Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. */ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32p4/esp_mfcc_models.h b/include/esp32p4/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32p4/esp_mfcc_models.h +++ b/include/esp32p4/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32p4/esp_speech_features.h b/include/esp32p4/esp_speech_features.h index 3552f4a..c1659f9 100644 --- a/include/esp32p4/esp_speech_features.h +++ b/include/esp32p4/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* 
esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); +esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int 
use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32p4/esp_vad.h b/include/esp32p4/esp_vad.h index 0c7f734..7e0b144 100644 --- a/include/esp32p4/esp_vad.h +++ b/include/esp32p4/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 +#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. */ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an 
instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. @@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. 
+ * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/include/esp32s3/esp_afe_aec.h b/include/esp32s3/esp_afe_aec.h new file mode 100644 index 0000000..9d60588 --- /dev/null +++ b/include/esp32s3/esp_afe_aec.h @@ -0,0 +1,82 @@ + +#ifndef _ESP_AFE_AEC_H_ +#define _ESP_AFE_AEC_H_ + + +#include "esp_afe_config.h" +#include "esp_aec.h" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct { + aec_handle_t* handle; + aec_mode_t mode; + afe_pcm_config_t pcm_config; + int frame_size; + int16_t *data; +}afe_aec_handle_t; + + +/** + * @brief Creates an instance to the AEC structure. + * + * @warning Currently only supports 1 microphone channel and 1 playback channel. + * If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected. + * + * The input format, same as afe config: + * M to represent the microphone channel + * R to represent the playback reference channel + * N to represent an unknown or unused channel + * + * For example, input_format="MMNR" indicates that the input data consists of four channels, + * which are the microphone channel, the microphone channel, an unused channel, and the playback channel + * + * @param input_format The input format + * @param filter_length The length of filter. The larger the filter, the higher the CPU loading. + * Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5. + * @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC + * @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF + * + * @return afe_aec_handle_t* The created AEC instance + */ +afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode); + + +/** + * @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and frame from mic.
+ * + * @param handel The instance of AEC. + * @param indata Input audio data, format is defined by input_format. Note indata will be modified in function call. + * @param outdata Returns near-end signal with echo removed. + * + * @return The size of outdata in bytes. + */ +size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata); + +/** + * @brief Get frame size of AEC (the samples of one frame) + * @param handle The instance of AEC. + * @return Frame size + */ +int afe_aec_get_chunksize(afe_aec_handle_t *handle); + + +/** + * @brief Free the AEC instance + * + * @param handel The instance of AEC. + * + * @return None + * + */ +void afe_aec_destroy(afe_aec_handle_t *handel); + +#ifdef __cplusplus +} +#endif + +#endif //_ESP_AFE_AEC_H_ diff --git a/include/esp32s3/esp_mfcc_iface.h b/include/esp32s3/esp_mfcc_iface.h index 95e287b..22a5f2c 100644 --- a/include/esp32s3/esp_mfcc_iface.h +++ b/include/esp32s3/esp_mfcc_iface.h @@ -1,6 +1,6 @@ #pragma once -#include #include "esp_speech_features.h" +#include /* This describes an interface for a MFCC runner, that is, some kind of implementation that can be @@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs multiple implementations can be used. */ - typedef struct esp_mfcc_data_t esp_mfcc_data_t; - -//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please -//refer to its documentation for details. +// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), +// please refer to its documentation for details. typedef struct { - int winstep_ms; // The step between successive windows in ms. (10) - int winlen_ms; // The length of the analysis window in ms.
(25) - int nch; // The number of input channel - int numcep; // The number of cepstrum to return - int nfilter; // The number of filters in the filterbank - int nfft; // The FFT size - int samp_freq; // The sample-rate of the signal. - int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) - int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq - float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) - char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" - bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy + int winstep_ms; // The step between successive windows in ms. (10) + int winlen_ms; // The length of the analysis window in ms. (25) + int nch; // The number of input channel + int numcep; // The number of cepstrum to return + int nfilter; // The number of filters in the filterbank + int nfft; // The FFT size + int samp_freq; // The sample-rate of the signal. + int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0) + int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq + float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97) + char *win_type; // Analysis window type to apply to each frame, "hanning","hamming","sine","rectangular","povey" + bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon)) - float log_epsilon; // log epsilon. (e.g. 1e-7) + float log_epsilon; // log epsilon. (e.g. 
1e-7) bool psram_first; // Alloc memory from PSRAM first - bool remove_dc_offset; // Whether to subtract mean of wave before FFT + bool remove_dc_offset; // Whether to subtract mean of wave before FFT } esp_mfcc_opts_t; - /** * @brief Un-initialize and free a mfcc runner * @@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r); * @param opt Options for the mfcc process * @return True if success, false on error. */ -typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); +typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); /** * @brief Run a mfcc iteration on frame by frame * * This will take a set of samples and return a ceptrum. Note that this may be pipelined: - * an initial call to this function may return NULL and subsequent calls may return the + * an initial call to this function may return NULL and subsequent calls may return the * cepstrum of previous calls. * * @param r The mfcc runner @@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt); * when done with this buffer. Note that some implementations require the buffer to be freed before another call * to this function is done. 
*/ -typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); +typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch); /** * @brief Clean all state of mfcc handle diff --git a/include/esp32s3/esp_mfcc_models.h b/include/esp32s3/esp_mfcc_models.h index f8e9119..231603b 100644 --- a/include/esp32s3/esp_mfcc_models.h +++ b/include/esp32s3/esp_mfcc_models.h @@ -1,18 +1,16 @@ #pragma once #include "esp_mfcc_iface.h" - extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle - /** * @brief Return basic opts used in wakenet9 & multinet5 **/ esp_mfcc_opts_t *get_mfcc_opts_wn9(); /** - * @brief Return basic opts for default kaldifeat - * + * @brief Return basic opts for default kaldifeat + * opts->psram_first = true; opts->use_power = true; opts->use_log_fbank = 2; // log(max(x, log_epsilon)) @@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi(); /** * @brief Print mfcc opts **/ -void print_mfcc_opts(esp_mfcc_opts_t *opts); \ No newline at end of file +void print_mfcc_opts(esp_mfcc_opts_t *opts); diff --git a/include/esp32s3/esp_speech_features.h b/include/esp32s3/esp_speech_features.h index 3552f4a..c1659f9 100644 --- a/include/esp32s3/esp_speech_features.h +++ b/include/esp32s3/esp_speech_features.h @@ -8,46 +8,45 @@ #define M_2PI 6.283185307179586476925286766559005 #endif -typedef struct -{ +typedef struct { float *coeff; int *bank_pos; int nfilter; } esp_mel_filter_t; -float* esp_mfcc_malloc(size_t size, bool from_psram); +float *esp_mfcc_malloc(size_t size, bool from_psram); void esp_mfcc_free(void *ptr); /** * @brief Initialize FFT table * @warning For ESP-PLATFORM, use esp-dsp fft - * For Other platform, use kiss fft - * - * @param nfft The input samples number + * For Other platform, use kiss fft + * + * @param nfft The input samples number * @return fft-table **/ -void* esp_fft_init(int nfft); +void *esp_fft_init(int nfft); /** * @brief Free FFT table * @warning For ESP-PLATFORM, use 
esp-dsp fft - * For Other platform, use kiss fft - * + * For Other platform, use kiss fft + * * @param fft_table The fft table initialized by esp_fft_init - * @param nfft The input samples number + * @param nfft The input samples number * @return fft-table **/ void esp_fft_deinit(void *fft_table, int nfft); /** * @brief Initial window function - * Currently support hanning, hamming, sine, povey, rectangular, + * Currently support hanning, hamming, sine, povey, rectangular, * wn9(512-hanning to get wakenet9& multinet5 compatible) **/ -float *esp_win_func_init(char *win_type, float* window_data, int frame_length); +float *esp_win_func_init(char *win_type, float *window_data, int frame_length); -float* esp_fftr(float* x, int nfft, void *fft_table); +float *esp_fftr(float *x, int nfft, void *fft_table); float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table); @@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc); float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last); -esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, - bool from_psram); +esp_mel_filter_t *esp_mel_filter_init( + int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram); void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter); -float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, - float epsilon); +float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon); diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index 0c7f734..7e0b144 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -20,19 +20,19 @@ extern "C" { #endif -#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000 -#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms +#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000 
+#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more * restrictive in reporting speech. So If you want trigger more speech, please select lower mode. */ typedef enum { - VAD_MODE_0 = 0, // Normal - VAD_MODE_1, // Aggressive - VAD_MODE_2, // Very Aggressive - VAD_MODE_3, // Very Very Aggressive - VAD_MODE_4 // Very Very Very Aggressive + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { @@ -51,10 +51,10 @@ typedef struct vad_trigger_tag { #define vad_MAX_LEN INT32_MAX - 1 /** * @brief Allocate wakenet trigger - * + * * @param min_speech_len Minimum frame number of speech duration * @param min_noise_len Minimum frame number of noise duration - * + * * @return Trigger pointer **/ vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); @@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger); **/ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); - typedef struct { vad_trigger_t *trigger; void *vad_inst; int sample_rate; int frame_size; -}vad_handle_with_trigger_t; +} vad_handle_with_trigger_t; -typedef vad_handle_with_trigger_t* vad_handle_t; +typedef vad_handle_with_trigger_t *vad_handle_t; // typedef vad_handle_tag * vad_handle_t; - /** * @brief Creates an instance to the VAD structure. * @@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); +vad_handle_t vad_create_with_param( + vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. 
@@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle); void vad_destroy(vad_handle_t inst); /* -* Programming Guide: -* -* @code{c} -* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure. -* -* while (1) { -* //Use buffer to receive the audio data from MIC. -* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. -* } -* -* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process -* -* @endcode -*/ + * Programming Guide: + * + * @code{c} + * vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to + * the VAD structure. + * + * while (1) { + * //Use buffer to receive the audio data from MIC. + * vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result. + * } + * + * vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process + * + * @endcode + */ #ifdef __cplusplus } diff --git a/lib/esp32/libc_speech_features.a b/lib/esp32/libc_speech_features.a index f3c0381..8cc82ff 100644 Binary files a/lib/esp32/libc_speech_features.a and b/lib/esp32/libc_speech_features.a differ diff --git a/lib/esp32/libesp_audio_front_end.a b/lib/esp32/libesp_audio_front_end.a index 3c2b9d7..f8e42cb 100644 Binary files a/lib/esp32/libesp_audio_front_end.a and b/lib/esp32/libesp_audio_front_end.a differ diff --git a/lib/esp32/libesp_audio_processor.a b/lib/esp32/libesp_audio_processor.a index bfaa367..6dc8766 100644 Binary files a/lib/esp32/libesp_audio_processor.a and b/lib/esp32/libesp_audio_processor.a differ diff --git a/lib/esp32/libmultinet.a b/lib/esp32/libmultinet.a index d5c18d9..8d154c4 100644 Binary files a/lib/esp32/libmultinet.a and b/lib/esp32/libmultinet.a differ diff --git a/lib/esp32/libwakenet.a b/lib/esp32/libwakenet.a index 5b90657..7222a23 100644 Binary files 
a/lib/esp32/libwakenet.a and b/lib/esp32/libwakenet.a differ diff --git a/lib/esp32c5/libesp_audio_front_end.a b/lib/esp32c5/libesp_audio_front_end.a new file mode 100644 index 0000000..450f76c Binary files /dev/null and b/lib/esp32c5/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libc_speech_features.a b/lib/esp32p4/libc_speech_features.a index dee49de..c913ab1 100644 Binary files a/lib/esp32p4/libc_speech_features.a and b/lib/esp32p4/libc_speech_features.a differ diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index cce5d29..07aca44 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index f32dd0f..b8b7cc0 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 16dca4f..18c8743 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index 8c3424e..5d1b4f6 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index 6eba4cd..6e78226 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 3c4f69c..924d26d 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index f27412e..fb42317 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index 4089104..885518c 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and 
b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index d113daf..6676ed4 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index 6a99a57..44fbb79 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index a2dd373..3164c92 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index c0465b1..b9751f5 100644 Binary files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index 62e7576..5f81799 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index f396b67..44c9c88 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index 533e9bd..bd2a26d 100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 0ff21b1..bb79b70 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/test_apps/esp-sr/main/test_afe.cpp b/test_apps/esp-sr/main/test_afe.cpp index ff20efe..edc5ace 100644 --- a/test_apps/esp-sr/main/test_afe.cpp +++ b/test_apps/esp-sr/main/test_afe.cpp @@ -18,6 +18,7 @@ #include "esp_wn_models.h" #include "esp_afe_sr_models.h" #include "dl_lib_convq_queue.h" +#include "esp_afe_aec.h" #include #if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4) @@ -297,4 +298,25 @@ TEST_CASE("afe performance test (2ch)", "[afe_perf]") afe_config_free(afe_config); } esp_srmodel_deinit(models); +} + + 
+TEST_CASE("test afe aec interface", "[afe]") +{ + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + + afe_aec_handle_t *handle = afe_aec_create("MNR", 4, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); + int frame_bytes = handle->frame_size * sizeof(int16_t); + int16_t *indata = (int16_t *) malloc(frame_bytes*handle->pcm_config.total_ch_num); + int16_t *outdata = (int16_t *) malloc(frame_bytes); + + afe_aec_process(handle, indata, outdata); + afe_aec_process(handle, indata, outdata); + afe_aec_process(handle, indata, outdata); + + afe_aec_destroy(handle); + free(indata); + free(outdata); + int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + TEST_ASSERT_EQUAL(true, end_size == start_size); } \ No newline at end of file diff --git a/test_apps/esp32c5/main/test_aec.cpp b/test_apps/esp32c5/main/test_aec.cpp index df1a937..ee18a0b 100644 --- a/test_apps/esp32c5/main/test_aec.cpp +++ b/test_apps/esp32c5/main/test_aec.cpp @@ -12,10 +12,64 @@ #include "freertos/FreeRTOS.h" #include "freertos/task.h" #include "esp_aec.h" +#include "esp_afe_aec.h" #include "audio_test_file.h" #include "unity.h" #include "esp_timer.h" + +TEST_CASE("test esp32c5 afe aec interface", "[aec]") +{ + // vad_handle_t vad_handle = (vad_handle_t)arg; + heap_caps_print_heap_info(MALLOC_CAP_8BIT); + int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL); + int sample_rate = 16000; + + afe_aec_handle_t *aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST); + afe_aec_destroy(aec_handle); + int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + printf("memory leak for first init: %d\n", start_size - first_end_size); + + aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST); + int audio_chunksize = afe_aec_get_chunksize(aec_handle); + printf("audio chunksize:%d\n", audio_chunksize); //512 + int16_t *buffer = (int16_t *)malloc(audio_chunksize * sizeof(int16_t)*2); + int16_t *out_buffer = 
(int16_t *)malloc(audio_chunksize * sizeof(int16_t)); + + int chunks = 0; + uint32_t c0, c1, c_res = 0; + while (1) { + if ((chunks + 1)*audio_chunksize * sizeof(int16_t) <= sizeof(audio_mic_file)) { + memcpy(buffer, audio_mic_file + chunks * audio_chunksize , audio_chunksize * sizeof(int16_t)); + memcpy(buffer+audio_chunksize, audio_ref_file + chunks * audio_chunksize , audio_chunksize * sizeof(int16_t)); + } else { + break; + } + + c0 = esp_timer_get_time(); + afe_aec_process(aec_handle, buffer, out_buffer); + c1 = esp_timer_get_time(); + + c_res += c1 - c0; + chunks++; + } + + free(buffer); + free(out_buffer); + printf("RAM size after vad detection: total:%d, internal:%d\n", + start_size - heap_caps_get_free_size(MALLOC_CAP_8BIT), + start_internal_size - heap_caps_get_free_size(MALLOC_CAP_INTERNAL)); + printf("Done! Took %ld ms to parse %d ms worth of samples in %d iterations.\n", + c_res/1000, chunks*audio_chunksize*1000/sample_rate, chunks); + afe_aec_destroy(aec_handle); + + int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT); + printf("memory leak:%d\n", start_size-end_size); + TEST_ASSERT_EQUAL(true, end_size == start_size); +} + + TEST_CASE("test esp32c5 aec", "[aec]") { // vad_handle_t vad_handle = (vad_handle_t)arg; @@ -68,3 +122,6 @@ TEST_CASE("test esp32c5 aec", "[aec]") printf("memory leak:%d\n", start_size-end_size); TEST_ASSERT_EQUAL(true, end_size == start_size); } + + +