feat: add afe aec interface

This commit is contained in:
xysun 2025-02-14 17:57:09 +08:00
parent a8b77d0795
commit b485bb4061
45 changed files with 689 additions and 222 deletions

View File

@ -111,7 +111,9 @@ elseif(${IDF_TARGET} STREQUAL "esp32c5")
component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format)
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a")
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a")
target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)
elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
#Only support TTS on esp32s2, esp32c3 and esp32c6

View File

@ -0,0 +1,82 @@
#ifndef _ESP_AFE_AEC_H_
#define _ESP_AFE_AEC_H_
#include "esp_afe_config.h"
#include "esp_aec.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief State of one AFE AEC instance, as returned by afe_aec_create().
*
* NOTE(review): the implementation is not visible from this header, so the
* per-field notes below are inferred from names and types only; confirm them
* against the implementation before relying on them.
*/
typedef struct {
aec_handle_t* handle; // presumably the underlying AEC instance (type comes from esp_aec.h) - TODO confirm
aec_mode_t mode; // AEC mode; presumably derived from the afe type/mode given at creation - TODO confirm
afe_pcm_config_t pcm_config; // PCM channel layout (afe_pcm_config_t is declared in esp_afe_config.h)
int frame_size; // presumably the samples of one frame, see afe_aec_get_chunksize() - TODO confirm
int16_t *data; // presumably an internal working buffer - TODO confirm size and ownership
}afe_aec_handle_t;
/**
* @brief Creates an AFE AEC instance.
*
* @warning Currently only 1 microphone channel and 1 playback channel are supported.
* If the input has multiple microphone channels and playback channels, only the first microphone channel and the first playback channel will be selected.
*
* The input format, same as afe config:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_aec_handle_t* The created AEC instance.
* NOTE(review): the value returned on failure (presumably NULL) is not stated in this header - confirm.
*/
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
/**
* @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
*
* @param handle The instance of AEC.
* @param indata Input audio data; its format is defined by the input_format passed to afe_aec_create().
* Note that indata will be modified by this call.
* @param outdata Returns near-end signal with echo removed.
* @return The size of outdata, in bytes.
*/
size_t afe_aec_process(afe_aec_handle_t *handle, int16_t *indata, int16_t *outdata);
/**
* @brief Get the frame size of AEC (the samples of one frame).
*
* @param handle The instance of AEC.
* @return Frame size, in samples.
* NOTE(review): whether this counts samples per channel or across all input channels is not stated here - confirm.
*/
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
/**
* @brief Free the AEC instance.
*
* @param handle The instance of AEC to free.
*
* @return None
*
*/
void afe_aec_destroy(afe_aec_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_AFE_AEC_H_

View File

@ -110,6 +110,8 @@ typedef struct {
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
// If you find vad cache can not cover all speech, please increase this value.
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false

View File

@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/**
* @brief Enable VAD algorithm.
* @brief Reset one function/module/algorithm.
*
* @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled
* @return -1: fail, 1: success
*/
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
/**
* @brief Disable one function/module/algorithm.
@ -204,6 +204,7 @@ typedef struct {
esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_reset_op_t reset_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc;

View File

@ -1,6 +1,6 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
#include <stdint.h>
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
// please refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
* @param opt Options for the mfcc process
* @return True if success, false on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle

View File

@ -1,18 +1,16 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
/**
* @brief Return basic opts used in wakenet9 & multinet5
**/
esp_mfcc_opts_t *get_mfcc_opts_wn9();
/**
* @brief Return basic opts for default kaldifeat
*
* @brief Return basic opts for default kaldifeat
*
opts->psram_first = true;
opts->use_power = true;
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
/**
* @brief Print mfcc opts
**/
void print_mfcc_opts(esp_mfcc_opts_t *opts);
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -8,46 +8,45 @@
#define M_2PI 6.283185307179586476925286766559005
#endif
typedef struct
{
typedef struct {
float *coeff;
int *bank_pos;
int nfilter;
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
float *esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
void *esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
* @brief Initial window function
* Currently support hanning, hamming, sine, povey, rectangular,
* Currently support hanning, hamming, sine, povey, rectangular,
* wn9(512-hanning to get wakenet9& multinet5 compatible)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_fftr(float *x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
esp_mel_filter_t *esp_mel_filter_init(
int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);
float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);

View File

@ -20,19 +20,19 @@
extern "C" {
#endif
#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
/**
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
* restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
*/
typedef enum {
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
} vad_mode_t;
typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
#define vad_MAX_LEN INT32_MAX - 1
/**
* @brief Allocate wakenet trigger
*
*
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
*
*
* @return Trigger pointer
**/
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
**/
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
} vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
typedef vad_handle_with_trigger_t *vad_handle_t;
// typedef vad_handle_tag * vad_handle_t;
/**
* @brief Creates an instance to the VAD structure.
*
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
vad_handle_t vad_create_with_param(
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -138,6 +137,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz,
*/
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Reset trigger state as Silence
*
* @param handle The instance of VAD.
*/
void vad_reset_trigger(vad_handle_t handle);
/**
* @brief Free the VAD instance
*
@ -149,20 +155,21 @@ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
void vad_destroy(vad_handle_t inst);
/*
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
#ifdef __cplusplus
}

View File

@ -0,0 +1,82 @@
#ifndef _ESP_AFE_AEC_H_
#define _ESP_AFE_AEC_H_
#include "esp_afe_config.h"
#include "esp_aec.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief State of one AFE AEC instance, as returned by afe_aec_create().
*
* NOTE(review): the implementation is not visible from this header, so the
* per-field notes below are inferred from names and types only; confirm them
* against the implementation before relying on them.
*/
typedef struct {
aec_handle_t* handle; // presumably the underlying AEC instance (type comes from esp_aec.h) - TODO confirm
aec_mode_t mode; // AEC mode; presumably derived from the afe type/mode given at creation - TODO confirm
afe_pcm_config_t pcm_config; // PCM channel layout (afe_pcm_config_t is declared in esp_afe_config.h)
int frame_size; // presumably the samples of one frame, see afe_aec_get_chunksize() - TODO confirm
int16_t *data; // presumably an internal working buffer - TODO confirm size and ownership
}afe_aec_handle_t;
/**
* @brief Creates an AFE AEC instance.
*
* @warning Currently only 1 microphone channel and 1 playback channel are supported.
* If the input has multiple microphone channels and playback channels, only the first microphone channel and the first playback channel will be selected.
*
* The input format, same as afe config:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_aec_handle_t* The created AEC instance.
* NOTE(review): the value returned on failure (presumably NULL) is not stated in this header - confirm.
*/
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
/**
* @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
*
* @param handle The instance of AEC.
* @param indata Input audio data; its format is defined by the input_format passed to afe_aec_create().
* Note that indata will be modified by this call.
* @param outdata Returns near-end signal with echo removed.
* @return The size of outdata, in bytes.
*/
size_t afe_aec_process(afe_aec_handle_t *handle, int16_t *indata, int16_t *outdata);
/**
* @brief Get the frame size of AEC (the samples of one frame).
*
* @param handle The instance of AEC.
* @return Frame size, in samples.
* NOTE(review): whether this counts samples per channel or across all input channels is not stated here - confirm.
*/
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
/**
* @brief Free the AEC instance.
*
* @param handle The instance of AEC to free.
*
* @return None
*
*/
void afe_aec_destroy(afe_aec_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_AFE_AEC_H_

View File

@ -0,0 +1,69 @@
#pragma once
#include "esp_aec.h"
#include "stdbool.h"
#include "stdint.h"
#include "stdlib.h"
#ifdef __cplusplus
extern "C" {
#endif
// AFE: Audio Front-End
// SR: Speech Recognition
// VC: Voice Communication
// Set AFE_SR mode
// Deprecated: every enumerator below is superseded by the matching afe_mode_t value.
typedef enum {
SR_MODE_LOW_COST = 0, // Deprecated, please use afe_mode_t, AFE mode: low cost mode
SR_MODE_HIGH_PERF = 1, // Deprecated, please use afe_mode_t, AFE mode: high performance mode
} afe_sr_mode_t;
// Set AFE mode
// The numeric values (0 and 1) match the deprecated SR_MODE_LOW_COST / SR_MODE_HIGH_PERF constants.
typedef enum {
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
} afe_mode_t;
// Set AFE type
// Selects the target scenario; per the notes below, the two types differ in whether
// nonlinear noise suppression is included.
typedef enum {
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
} afe_type_t;
// Set AFE memory allocation mode
// Chooses how allocations are split between internal RAM and PSRAM. Values start at 1, not 0.
typedef enum {
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
AFE_MEMORY_ALLOC_MORE_PSRAM = 3 // malloc with more psram
} afe_memory_alloc_mode_t;
// Set the peak amplitude of the fetched audio
// Each enumerator's numeric value is the peak level in dB that it names (0 means no AGC gain).
typedef enum {
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetch audio is -3dB
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
} afe_mn_peak_agc_mode_t;
// PCM input layout: channel counts, channel indices and sample rate of the audio.
// NOTE(review): ownership and lifetime of the mic_ids / ref_ids arrays are not stated in this header - confirm.
typedef struct {
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
int mic_num; // microphone channel number
uint8_t *mic_ids; // microphone channel indices
int ref_num; // playback reference channel number
uint8_t *ref_ids; // playback reference channel indices
int sample_rate; // sample rate of audio (presumably in Hz - TODO confirm)
} afe_pcm_config_t;
// Set AFE NS mode
// NS presumably stands for noise suppression - TODO confirm.
typedef enum {
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
AFE_NS_MODE_NET = 1, // please use model name of NSNET
} afe_ns_mode_t;
// Set AFE AGC mode
// Selects which algorithm provides the AGC gain.
typedef enum {
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
} afe_agc_mode_t;
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,82 @@
#ifndef _ESP_AFE_AEC_H_
#define _ESP_AFE_AEC_H_
#include "esp_afe_config.h"
#include "esp_aec.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief State of one AFE AEC instance, as returned by afe_aec_create().
*
* NOTE(review): the implementation is not visible from this header, so the
* per-field notes below are inferred from names and types only; confirm them
* against the implementation before relying on them.
*/
typedef struct {
aec_handle_t* handle; // presumably the underlying AEC instance (type comes from esp_aec.h) - TODO confirm
aec_mode_t mode; // AEC mode; presumably derived from the afe type/mode given at creation - TODO confirm
afe_pcm_config_t pcm_config; // PCM channel layout (afe_pcm_config_t is declared in esp_afe_config.h)
int frame_size; // presumably the samples of one frame, see afe_aec_get_chunksize() - TODO confirm
int16_t *data; // presumably an internal working buffer - TODO confirm size and ownership
}afe_aec_handle_t;
/**
* @brief Creates an AFE AEC instance.
*
* @warning Currently only 1 microphone channel and 1 playback channel are supported.
* If the input has multiple microphone channels and playback channels, only the first microphone channel and the first playback channel will be selected.
*
* The input format, same as afe config:
* M to represent the microphone channel
* R to represent the playback reference channel
* N to represent an unknown or unused channel
*
* For example, input_format="MMNR" indicates that the input data consists of four channels,
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
*
* @param input_format The input format
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
*
* @return afe_aec_handle_t* The created AEC instance.
* NOTE(review): the value returned on failure (presumably NULL) is not stated in this header - confirm.
*/
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
/**
* @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
*
* @param handle The instance of AEC.
* @param indata Input audio data; its format is defined by the input_format passed to afe_aec_create().
* Note that indata will be modified by this call.
* @param outdata Returns near-end signal with echo removed.
* @return The size of outdata, in bytes.
*/
size_t afe_aec_process(afe_aec_handle_t *handle, int16_t *indata, int16_t *outdata);
/**
* @brief Get the frame size of AEC (the samples of one frame).
*
* @param handle The instance of AEC.
* @return Frame size, in samples.
* NOTE(review): whether this counts samples per channel or across all input channels is not stated here - confirm.
*/
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
/**
* @brief Free the AEC instance.
*
* @param handle The instance of AEC to free.
*
* @return None
*
*/
void afe_aec_destroy(afe_aec_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_AFE_AEC_H_

View File

@ -1,6 +1,6 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
#include <stdint.h>
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
// please refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
* @param opt Options for the mfcc process
* @return True if success, false on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle

View File

@ -1,18 +1,16 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
/**
* @brief Return basic opts used in wakenet9 & multinet5
**/
esp_mfcc_opts_t *get_mfcc_opts_wn9();
/**
* @brief Return basic opts for default kaldifeat
*
* @brief Return basic opts for default kaldifeat
*
opts->psram_first = true;
opts->use_power = true;
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
/**
* @brief Print mfcc opts
**/
void print_mfcc_opts(esp_mfcc_opts_t *opts);
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -8,46 +8,45 @@
#define M_2PI 6.283185307179586476925286766559005
#endif
typedef struct
{
typedef struct {
float *coeff;
int *bank_pos;
int nfilter;
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
float *esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
void *esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @param nfft The input samples number
* @return fft-table
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
* @brief Initial window function
* Currently support hanning, hamming, sine, povey, rectangular,
* Currently support hanning, hamming, sine, povey, rectangular,
* wn9(512-hanning to get wakenet9& multinet5 compatible)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_fftr(float *x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
esp_mel_filter_t *esp_mel_filter_init(
int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);
float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);

View File

@ -20,19 +20,19 @@
extern "C" {
#endif
#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
/**
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
* restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
*/
typedef enum {
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
} vad_mode_t;
typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
#define vad_MAX_LEN INT32_MAX - 1
/**
* @brief Allocate wakenet trigger
*
*
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
*
*
* @return Trigger pointer
**/
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
**/
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
} vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
typedef vad_handle_with_trigger_t *vad_handle_t;
// typedef vad_handle_tag * vad_handle_t;
/**
* @brief Creates an instance to the VAD structure.
*
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
vad_handle_t vad_create_with_param(
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle);
void vad_destroy(vad_handle_t inst);
/*
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
#ifdef __cplusplus
}

View File

@ -0,0 +1,82 @@
#ifndef _ESP_AFE_AEC_H_
#define _ESP_AFE_AEC_H_
#include "esp_afe_config.h"
#include "esp_aec.h"
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/**
 * @brief State of one AFE AEC instance, as created by afe_aec_create().
 */
typedef struct {
// Underlying AEC instance (type declared in esp_aec.h).
aec_handle_t* handle;
// AEC mode of this instance.
aec_mode_t mode;
// PCM channel configuration of the input data; callers use pcm_config.total_ch_num to size the input buffer.
afe_pcm_config_t pcm_config;
// Number of samples in one frame of a single channel.
int frame_size;
// NOTE(review): looks like an internal working buffer owned by the instance - confirm against the implementation.
int16_t *data;
}afe_aec_handle_t;
/**
 * @brief Creates an instance of the AFE AEC structure.
 *
 * @warning Currently only 1 microphone channel and 1 playback channel are supported.
 *          If the input has multiple microphone channels or playback channels, only the first microphone
 *          channel and the first playback channel will be selected.
 *
 * The input format, same as afe config:
 * M to represent the microphone channel
 * R to represent the playback reference channel
 * N to represent an unknown or unused channel
 *
 * For example, input_format="MMNR" indicates that the input data consists of four channels,
 * which are the microphone channel, the microphone channel, an unused channel, and the playback channel
 *
 * @param input_format     The input format
 * @param filter_length    The length of filter. The larger the filter, the higher the CPU loading.
 *                         Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
 * @param type             The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
 * @param mode             The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
 *
 * @return afe_aec_handle_t* The created AEC instance; release it with afe_aec_destroy().
 *         NOTE(review): the failure behaviour (presumably NULL) is not stated here - confirm against the implementation.
 */
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
/**
 * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
 *
 * @param handle   The instance of AEC.
 * @param indata   Input audio data, format is defined by input_format. Note indata will be modified in function call.
 * @param outdata  Returns near-end signal with echo removed.
 * @return The bytes of outdata.
 */
size_t afe_aec_process(afe_aec_handle_t *handle, int16_t *indata, int16_t *outdata);
/**
 * @brief Get the frame size of AEC (the number of samples in one frame)
 *
 * @param handle The instance of AEC.
 * @return Frame size, in samples
 */
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
/**
 * @brief Free the AEC instance
 *
 * @param handle The instance of AEC.
 *
 * @return None
 *
 */
void afe_aec_destroy(afe_aec_handle_t *handle);
#ifdef __cplusplus
}
#endif
#endif //_ESP_AFE_AEC_H_

View File

@ -1,6 +1,6 @@
#pragma once
#include <stdint.h>
#include "esp_speech_features.h"
#include <stdint.h>
/*
This describes an interface for a MFCC runner, that is, some kind of implementation that can be
@ -8,33 +8,30 @@ fed sample chunks and returns the MFCC cepstrum of those samples. This is an abs
multiple implementations can be used.
*/
typedef struct esp_mfcc_data_t esp_mfcc_data_t;
//Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features), please
//refer to its documentation for details.
// Options for the mfcc algorithm itself. These more-or-less match the parameters of csf_mfcc (from c_speech_features),
// please refer to its documentation for details.
typedef struct {
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; // If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
int winstep_ms; // The step between successive windows in ms. (10)
int winlen_ms; // The length of the analysis window in ms. (25)
int nch; // The number of input channel
int numcep; // The number of cepstrum to return
int nfilter; // The number of filters in the filterbank
int nfft; // The FFT size
int samp_freq; // The sample-rate of the signal.
int low_freq; // The lowest band edge of mel filters, in hz. (e.g. 0)
int high_freq; // The highest band edge of mel filters, in hz. Must not be higher than samp_freq
float preemph; // Preemphasis filter coefficient. 0 is no filter. (e.g. 0.97)
char *win_type; // Analysis window type to apply to each frame "hanning","hamming","sine","rectangular","povey"
bool append_energy; //  If true, the zeroth cepstral coefficient is replaced with the log of the total frame energy
bool use_power; // If true, use power of fft spectrum, else use magnitude of fft spectrum
int use_log_fbank; // 0: return fbank, 1: return log(x+log_epsilon), 2: return log(max(x, log_epsilon))
float log_epsilon; // log epsilon. (e.g. 1e-7)
float log_epsilon; // log epsilon. (e.g. 1e-7)
bool psram_first; // Alloc memory from PSRAM first
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
bool remove_dc_offset; // Whether to subtract mean of wave before FFT
} esp_mfcc_opts_t;
/**
* @brief Un-initialize and free a mfcc runner
*
@ -54,13 +51,13 @@ typedef void (*esp_mfcc_op_destroy_t)(esp_mfcc_data_t *r);
* @param opt Options for the mfcc process
* @return True if success, false on error.
*/
typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
typedef esp_mfcc_data_t *(*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
/**
* @brief Run a mfcc iteration on frame by frame
*
* This will take a set of samples and return a cepstrum. Note that this may be pipelined:
* an initial call to this function may return NULL and subsequent calls may return the
* an initial call to this function may return NULL and subsequent calls may return the
* cepstrum of previous calls.
*
* @param r The mfcc runner
@ -69,7 +66,7 @@ typedef esp_mfcc_data_t* (*esp_mfcc_op_create_t)(const esp_mfcc_opts_t *opt);
* when done with this buffer. Note that some implementations require the buffer to be freed before another call
* to this function is done.
*/
typedef float* (*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
typedef float *(*esp_mfcc_op_run_step_t)(esp_mfcc_data_t *r, int16_t *samp, int16_t nch);
/**
* @brief Clean all state of mfcc handle

View File

@ -1,18 +1,16 @@
#pragma once
#include "esp_mfcc_iface.h"
extern const esp_mfcc_iface_t esp_fbank_f32; // float32-fbank handle
/**
* @brief Return basic opts used in wakenet9 & multinet5
**/
esp_mfcc_opts_t *get_mfcc_opts_wn9();
/**
* @brief Return basic opts for default kaldifeat
*
* @brief Return basic opts for default kaldifeat
*
opts->psram_first = true;
opts->use_power = true;
opts->use_log_fbank = 2; // log(max(x, log_epsilon))
@ -37,4 +35,4 @@ esp_mfcc_opts_t *get_mfcc_opts_kaldi();
/**
* @brief Print mfcc opts
**/
void print_mfcc_opts(esp_mfcc_opts_t *opts);
void print_mfcc_opts(esp_mfcc_opts_t *opts);

View File

@ -8,46 +8,45 @@
#define M_2PI 6.283185307179586476925286766559005
#endif
typedef struct
{
typedef struct {
float *coeff;
int *bank_pos;
int nfilter;
} esp_mel_filter_t;
float* esp_mfcc_malloc(size_t size, bool from_psram);
float *esp_mfcc_malloc(size_t size, bool from_psram);
void esp_mfcc_free(void *ptr);
/**
* @brief Initialize FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* For Other platform, use kiss fft
*
* @param nfft The input samples number
* @return fft-table
**/
void* esp_fft_init(int nfft);
void *esp_fft_init(int nfft);
/**
* @brief Free FFT table
* @warning For ESP-PLATFORM, use esp-dsp fft
* For Other platform, use kiss fft
*
* For Other platform, use kiss fft
*
* @param fft_table The fft table initialized by esp_fft_init
* @param nfft The input samples number
* @param nfft The input samples number
* @return None
**/
void esp_fft_deinit(void *fft_table, int nfft);
/**
* @brief Initial window function
* Currently support hanning, hamming, sine, povey, rectangular,
* Currently support hanning, hamming, sine, povey, rectangular,
* wn9(512-hanning to get wakenet9& multinet5 compatible)
**/
float *esp_win_func_init(char *win_type, float* window_data, int frame_length);
float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
float* esp_fftr(float* x, int nfft, void *fft_table);
float *esp_fftr(float *x, int nfft, void *fft_table);
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
@ -55,10 +54,9 @@ void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
float *esp_preemphasis_step(float *x, unsigned int len, float coeff, float last);
esp_mel_filter_t *esp_mel_filter_init(int nfft, int nfilter, int low_freq, int high_freq, int samp_freq,
bool from_psram);
esp_mel_filter_t *esp_mel_filter_init(
int nfft, int nfilter, int low_freq, int high_freq, int samp_freq, bool from_psram);
void esp_mel_filter_deinit(esp_mel_filter_t *mel_filter);
float* esp_mel_dotprod_step(float* x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank,
float epsilon);
float *esp_mel_dotprod_step(float *x, float *out, esp_mel_filter_t *mel_filter, int use_log_fbank, float epsilon);

View File

@ -20,19 +20,19 @@
extern "C" {
#endif
#define SAMPLE_RATE_HZ 16000 //Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 //Supports 10ms, 20ms, 30ms
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
/**
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
* restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
*/
typedef enum {
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
} vad_mode_t;
typedef enum {
@ -51,10 +51,10 @@ typedef struct vad_trigger_tag {
// Upper bound constant (INT32_MAX - 1). Parenthesized so the macro always expands as a single operand,
// e.g. `vad_MAX_LEN * 2` or `-vad_MAX_LEN` no longer bind to only part of the expression.
#define vad_MAX_LEN (INT32_MAX - 1)
/**
* @brief Allocate VAD trigger
*
*
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
*
*
* @return Trigger pointer
**/
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
@ -74,19 +74,17 @@ void vad_trigger_reset(vad_trigger_t *trigger);
**/
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
typedef struct {
vad_trigger_t *trigger;
void *vad_inst;
int sample_rate;
int frame_size;
}vad_handle_with_trigger_t;
} vad_handle_with_trigger_t;
typedef vad_handle_with_trigger_t* vad_handle_t;
typedef vad_handle_with_trigger_t *vad_handle_t;
// typedef vad_handle_tag * vad_handle_t;
/**
* @brief Creates an instance to the VAD structure.
*
@ -110,7 +108,8 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
vad_handle_t vad_create_with_param(
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -156,20 +155,21 @@ void vad_reset_trigger(vad_handle_t handle);
void vad_destroy(vad_handle_t inst);
/*
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
* Programming Guide:
*
* @code{c}
* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Creates an instance to
* the VAD structure.
*
* while (1) {
* //Use buffer to receive the audio data from MIC.
* vad_state_t vad_state = vad_process(vad_inst, buffer); // Feed samples to the VAD process and get the result.
* }
*
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
*
* @endcode
*/
#ifdef __cplusplus
}

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -18,6 +18,7 @@
#include "esp_wn_models.h"
#include "esp_afe_sr_models.h"
#include "dl_lib_convq_queue.h"
#include "esp_afe_aec.h"
#include <sys/time.h>
#if (CONFIG_IDF_TARGET_ESP32S3 || CONFIG_IDF_TARGET_ESP32P4)
@ -297,4 +298,25 @@ TEST_CASE("afe performance test (2ch)", "[afe_perf]")
afe_config_free(afe_config);
}
esp_srmodel_deinit(models);
}
// Smoke test for the AFE AEC wrapper: create an instance, process a few frames,
// destroy it, and verify that the 8-bit-capable heap is back at its starting level.
TEST_CASE("test afe aec interface", "[afe]")
{
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);

    afe_aec_handle_t *handle = afe_aec_create("MNR", 4, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
    // Fail the test cleanly instead of dereferencing a NULL handle below.
    TEST_ASSERT_NOT_NULL(handle);

    size_t frame_bytes = handle->frame_size * sizeof(int16_t);
    // calloc so the AEC is fed deterministic silence rather than indeterminate heap contents.
    int16_t *indata = (int16_t *)calloc(handle->pcm_config.total_ch_num, frame_bytes);
    int16_t *outdata = (int16_t *)calloc(1, frame_bytes);
    TEST_ASSERT_NOT_NULL(indata);
    TEST_ASSERT_NOT_NULL(outdata);

    // Process three frames, as the original test did.
    for (int i = 0; i < 3; i++) {
        afe_aec_process(handle, indata, outdata);
    }

    afe_aec_destroy(handle);
    free(indata);
    free(outdata);

    // Report both values on failure instead of a bare true/false mismatch.
    int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    TEST_ASSERT_EQUAL(start_size, end_size);
}

View File

@ -12,10 +12,64 @@
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_aec.h"
#include "esp_afe_aec.h"
#include "audio_test_file.h"
#include "unity.h"
#include "esp_timer.h"
// Runs the AFE AEC wrapper over the bundled mic/reference test audio, reports memory usage
// and processing time, and verifies that no heap memory is left allocated afterwards.
TEST_CASE("test esp32c5 afe aec interface", "[aec]")
{
    heap_caps_print_heap_info(MALLOC_CAP_8BIT);
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
    int sample_rate = 16000;

    // First create/destroy cycle: report heap that is not returned after the very first init.
    afe_aec_handle_t *aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST);
    TEST_ASSERT_NOT_NULL(aec_handle);
    afe_aec_destroy(aec_handle);
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    printf("memory leak for first init: %d\n", start_size - first_end_size);

    aec_handle = afe_aec_create("MR", 2, AFE_TYPE_SR, AFE_MODE_LOW_COST);
    TEST_ASSERT_NOT_NULL(aec_handle);
    int audio_chunksize = afe_aec_get_chunksize(aec_handle);
    printf("audio chunksize:%d\n", audio_chunksize); //512
    // A non-positive chunk size would make the loop below spin forever.
    TEST_ASSERT_TRUE(audio_chunksize > 0);

    size_t chunk_bytes = audio_chunksize * sizeof(int16_t);
    // Input buffer holds one mic chunk followed by one reference chunk.
    int16_t *buffer = (int16_t *)malloc(chunk_bytes * 2);
    int16_t *out_buffer = (int16_t *)malloc(chunk_bytes);
    TEST_ASSERT_NOT_NULL(buffer);
    TEST_ASSERT_NOT_NULL(out_buffer);

    int chunks = 0;
    // esp_timer_get_time() returns int64_t microseconds; keep the full width instead of truncating to 32 bits.
    int64_t c_res = 0;

    // NOTE(review): only audio_mic_file bounds the loop - assumes audio_ref_file is at least as long; confirm.
    while ((chunks + 1) * chunk_bytes <= sizeof(audio_mic_file)) {
        memcpy(buffer, audio_mic_file + chunks * audio_chunksize, chunk_bytes);
        memcpy(buffer + audio_chunksize, audio_ref_file + chunks * audio_chunksize, chunk_bytes);

        int64_t c0 = esp_timer_get_time();
        afe_aec_process(aec_handle, buffer, out_buffer);
        int64_t c1 = esp_timer_get_time();
        c_res += c1 - c0;
        chunks++;
    }

    free(buffer);
    free(out_buffer);

    // Convert to int explicitly so the arguments match the %d conversions.
    int used_total = start_size - (int)heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int used_internal = start_internal_size - (int)heap_caps_get_free_size(MALLOC_CAP_INTERNAL);
    printf("RAM size after aec process: total:%d, internal:%d\n", used_total, used_internal);
    printf("Done! Took %ld ms to parse %d ms worth of samples in %d iterations.\n",
           (long)(c_res / 1000), chunks * audio_chunksize * 1000 / sample_rate, chunks);

    afe_aec_destroy(aec_handle);
    int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    printf("memory leak:%d\n", start_size - end_size);
    // Report both values on failure instead of a bare true/false mismatch.
    TEST_ASSERT_EQUAL(start_size, end_size);
}
TEST_CASE("test esp32c5 aec", "[aec]")
{
// vad_handle_t vad_handle = (vad_handle_t)arg;
@ -68,3 +122,6 @@ TEST_CASE("test esp32c5 aec", "[aec]")
printf("memory leak:%d\n", start_size-end_size);
TEST_ASSERT_EQUAL(true, end_size == start_size);
}