mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
feat: update esp32 AFE interface
This commit is contained in:
parent
247d7eecf6
commit
284b382189
@ -114,9 +114,8 @@ before_script:
|
||||
expire_in: 1 week
|
||||
variables:
|
||||
EXAMPLES_PATH: "test_apps"
|
||||
TARGET: "esp32s3"
|
||||
script:
|
||||
- python ./test_apps/build_apps.py $EXAMPLES_PATH -t $TARGET
|
||||
- python ./test_apps/build_apps.py $EXAMPLES_PATH -t all
|
||||
|
||||
build_esp_sr:
|
||||
extends:
|
||||
@ -125,7 +124,6 @@ build_esp_sr:
|
||||
parallel:
|
||||
matrix:
|
||||
- IMAGE: [espressif/idf:release-v5.3, espressif/idf:latest]
|
||||
TARGET: ["esp32s3", "esp32p4"]
|
||||
EXAMPLES_PATH: "test_apps/esp-sr"
|
||||
|
||||
|
||||
@ -136,7 +134,6 @@ build_esp_tts:
|
||||
parallel:
|
||||
matrix:
|
||||
- IMAGE: [espressif/idf:release-v5.3, espressif/idf:latest]
|
||||
TARGET: ["esp32s3", "esp32p4"]
|
||||
EXAMPLES_PATH: "test_apps/esp-tts"
|
||||
|
||||
.test_template: &test_template
|
||||
@ -183,8 +180,6 @@ test_esp_sr:
|
||||
matrix:
|
||||
- TEST_TARGET: esp32s3
|
||||
TEST_ENV: esp32s3
|
||||
- TEST_TARGET: esp32p4
|
||||
TEST_ENV: esp32p4
|
||||
tags:
|
||||
- ${TEST_ENV}
|
||||
image: $DOCKER_TARGET_TEST_v5_3_ENV_IMAGE
|
||||
@ -320,4 +315,4 @@ push_to_github:
|
||||
- echo -e "Host github.com\n\tStrictHostKeyChecking no\n" >> ~/.ssh/config
|
||||
- git remote remove github &>/dev/null || true
|
||||
- git remote add github git@github.com:espressif/esp-sr.git
|
||||
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"
|
||||
- git push github "${CI_COMMIT_SHA}:refs/heads/${CI_COMMIT_REF_NAME}"
|
||||
@ -21,80 +21,72 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
#define USE_AEC_FFT // Not kiss_fft
|
||||
#define AEC_USE_SPIRAM 0
|
||||
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
|
||||
//#define AEC_FRAME_LENGTH_MS 16
|
||||
#define AEC_FRAME_LENGTH_MS 32
|
||||
#define AEC_FILTER_LENGTH 1200 // Number of samples of echo to cancel
|
||||
|
||||
typedef void* aec_handle_t;
|
||||
typedef struct aec_handle_t aec_handle_t;
|
||||
typedef enum {
|
||||
AEC_MODE_SR_LOW_COST = 0, // Low Cost AEC fro speech recognition
|
||||
AEC_MODE_SR_HIGH_PERF = 1, // High Perforamce AEC for speech recognition
|
||||
AEC_MODE_VOIP_LOW_COST = 3, // Low Cost AEC for voice communication
|
||||
AEC_MODE_VOIP_HIGH_PERF = 4, // High Perforamce AEC for voice communication
|
||||
} aec_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
* Please get frame size by aec_get_chunksize() function
|
||||
*
|
||||
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
|
||||
*
|
||||
* @param sample_rate The Sampling frequency (Hz) must be 16000.
|
||||
*
|
||||
* @param frame_length The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param filter_length Number of samples of echo to cancel.
|
||||
*
|
||||
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
|
||||
* @param channel_num The input microphone channel number
|
||||
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of AEC
|
||||
*/
|
||||
aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
|
||||
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
* @brief Creates an instance to the AEC structure, same with aec_create().
|
||||
*
|
||||
* @deprecated This API will be deprecated after version 1.0, please use aec_pro_create
|
||||
*
|
||||
* @param sample_rate The Sampling frequency (Hz) must be 16000.
|
||||
*
|
||||
* @param frame_length The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param filter_length Number of samples of echo to cancel.
|
||||
*
|
||||
* @param nch Number of input signal channel.
|
||||
*
|
||||
* @param filter_length Number of filter, recommend to set 4. The larger the filter_length, the more resource consumption.
|
||||
* @param channel_num The input microphone channel number
|
||||
* @param mode The mode of AEC, recommend to set AEC_MODE_SR_LOW_COST
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of AEC
|
||||
*/
|
||||
aec_handle_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of more powerful AEC.
|
||||
*
|
||||
* @param frame_length Length of input signal. Must be 16ms if mode is 0; otherwise could be 16ms or 32ms. Length of input signal to aec_process must be modified accordingly.
|
||||
*
|
||||
* @param nch Number of microphones.
|
||||
*
|
||||
* @param mode Mode of AEC (0 to 5), indicating aggressiveness and RAM allocation. 0: mild; 1 or 2: medium (1: internal RAM, 2: SPIRAM); 3 and 4: aggressive (3: internal RAM, 4: SPIRAM); 5: agressive, accelerated for ESP32-S3.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: An Instance of AEC
|
||||
*/
|
||||
aec_handle_t aec_pro_create(int frame_length, int nch, int mode);
|
||||
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
|
||||
*
|
||||
* @param inst The instance of AEC.
|
||||
*
|
||||
* @warning The indata, refdata and outdata must be 16-bit signed. please allocate memory by heap_caps_aligned_alloc().
|
||||
*
|
||||
* @param inst The instance of AEC. Format for multi-channel data is "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..."
|
||||
* @param indata An array of 16-bit signed audio samples from mic.
|
||||
*
|
||||
* @param refdata An array of 16-bit signed audio samples sent to the speaker.
|
||||
*
|
||||
* @param outdata Returns near-end signal with echo removed.
|
||||
*
|
||||
* @param outdata Returns near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
|
||||
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Get frame size of AEC (the samples of one frame)
|
||||
* @param handle The instance of AEC.
|
||||
* @return Frame size
|
||||
*/
|
||||
int aec_get_chunksize(const aec_handle_t *handle);
|
||||
|
||||
/**
|
||||
* @brief Get AEC mode string
|
||||
*
|
||||
* @param aec_mode The mode of AEC.
|
||||
*
|
||||
* @return AEC mode string
|
||||
*/
|
||||
char * aec_get_mode_string(aec_mode_t aec_mode);
|
||||
|
||||
/**
|
||||
* @brief Free the AEC instance
|
||||
@ -104,7 +96,7 @@ void aec_process(const aec_handle_t inst, int16_t *indata, int16_t *refdata, int
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void aec_destroy(aec_handle_t inst);
|
||||
void aec_destroy(aec_handle_t *handel);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@ -1,24 +1,41 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdlib.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "esp_vad.h"
|
||||
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "model_path.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
//AFE: Audio Front-End
|
||||
//SR: Speech Recognition
|
||||
//afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
//VC: Voice Communication
|
||||
|
||||
//Set AFE_SR mode
|
||||
typedef enum {
|
||||
SR_MODE_LOW_COST = 0,
|
||||
SR_MODE_HIGH_PERF = 1
|
||||
SR_MODE_LOW_COST = 0, //Deprecated, please use afe_mode_t, AFE mode: low cost mode
|
||||
SR_MODE_HIGH_PERF = 1, //Deprecated, please use afe_mode_t, AFE mode: high performance mode
|
||||
} afe_sr_mode_t;
|
||||
|
||||
//Set AFE mode
|
||||
typedef enum {
|
||||
AFE_MODE_LOW_COST = 0, // AFE mode: low cost mode
|
||||
AFE_MODE_HIGH_PERF = 1, // AFE mode: high performance mode
|
||||
} afe_mode_t;
|
||||
|
||||
//Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MEMORY_ALLOC_MORE_INTERNAL = 1, // malloc with more internal ram
|
||||
AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE = 2, // malloc with internal ram and psram in balance
|
||||
@ -26,24 +43,30 @@ typedef enum {
|
||||
} afe_memory_alloc_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of audio fed to multinet is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of audio fed to multinet is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of audio fed to multinet is -3dB
|
||||
AFE_MN_PEAK_AGC_MODE_1 = -9, // The peak amplitude of fetch audio is -9dB
|
||||
AFE_MN_PEAK_AGC_MODE_2 = -6, // The peak amplitude of fetch audio is -6dB
|
||||
AFE_MN_PEAK_AGC_MODE_3 = -3, // The peak amplitude of fetcg is -3dB
|
||||
AFE_MN_PEAK_NO_AGC = 0, // There is no agc gain
|
||||
} afe_mn_peak_agc_mode_t;
|
||||
|
||||
typedef struct {
|
||||
int total_ch_num; // total channel num. It must be: total_ch_num = mic_num + ref_num
|
||||
int mic_num; // mic channel num
|
||||
int ref_num; // reference channel num
|
||||
int sample_rate; // sample rate of audio
|
||||
int total_ch_num; // total channel num, include microphone channel, playback channel and unknown channel
|
||||
int mic_num; // microphone channel number
|
||||
uint8_t* mic_ids; // microphone channel indices
|
||||
int ref_num; // playback reference channel number
|
||||
uint8_t* ref_ids; // playback reference channel indices
|
||||
int sample_rate; // sample rate of audio
|
||||
} afe_pcm_config_t;
|
||||
|
||||
typedef enum {
|
||||
NS_MODE_SSP = 0, // speech signal process method
|
||||
NS_MODE_NET = 1, // deep noise suppression net method
|
||||
AFE_NS_MODE_WEBRTC = 0, // please use model name of NS, SSP: "WEBRTC"
|
||||
AFE_NS_MODE_NET = 1, // please use model name of NSNET
|
||||
} afe_ns_mode_t;
|
||||
|
||||
typedef enum {
|
||||
AFE_AGC_MODE_WEBRTC = 0, // WEBRTC AGC
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
@ -66,148 +89,192 @@ typedef struct {
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
bool aec_init;
|
||||
bool se_init;
|
||||
bool vad_init;
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: 1000 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
bool voice_communication_init;
|
||||
bool voice_communication_agc_init; // AGC swich for voice communication
|
||||
int voice_communication_agc_gain; // AGC gain(dB) for voice communication
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode;
|
||||
afe_sr_mode_t afe_mode;
|
||||
int afe_perferred_core;
|
||||
int afe_perferred_priority;
|
||||
int afe_ringbuf_size;
|
||||
afe_memory_alloc_mode_t memory_alloc_mode;
|
||||
float afe_linear_gain; // The linear gain for sr output(note: invaild for vc), the value should be in [0.1, 10.0].
|
||||
// This value acts directly on the output amplitude: out_linear_gain * amplitude.
|
||||
afe_mn_peak_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
|
||||
afe_ns_mode_t afe_ns_mode;
|
||||
char *afe_ns_model_name;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based on the chip target and input format.
|
||||
* You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
#if CONFIG_IDF_TARGET_ESP32
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
.aec_init = true, \
|
||||
.se_init = true, \
|
||||
.vad_init = true, \
|
||||
.wakenet_init = true, \
|
||||
.voice_communication_init = false, \
|
||||
.voice_communication_agc_init = false, \
|
||||
.voice_communication_agc_gain = 15, \
|
||||
.vad_mode = VAD_MODE_3, \
|
||||
.wakenet_model_name = NULL, \
|
||||
.wakenet_model_name_2 = NULL, \
|
||||
.wakenet_mode = DET_MODE_90, \
|
||||
.afe_mode = SR_MODE_HIGH_PERF, \
|
||||
.afe_perferred_core = 0, \
|
||||
.afe_perferred_priority = 5, \
|
||||
.afe_ringbuf_size = 50, \
|
||||
.memory_alloc_mode = AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE, \
|
||||
.afe_linear_gain = 1.0, \
|
||||
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
|
||||
.pcm_config = { \
|
||||
.total_ch_num = 2, \
|
||||
.mic_num = 1, \
|
||||
.ref_num = 1, \
|
||||
.sample_rate = 16000, \
|
||||
}, \
|
||||
.debug_init = false, \
|
||||
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#elif CONFIG_IDF_TARGET_ESP32P4
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
.aec_init = true, \
|
||||
.se_init = true, \
|
||||
.vad_init = true, \
|
||||
.wakenet_init = true, \
|
||||
.voice_communication_init = false, \
|
||||
.voice_communication_agc_init = false, \
|
||||
.voice_communication_agc_gain = 15, \
|
||||
.vad_mode = VAD_MODE_3, \
|
||||
.wakenet_model_name = NULL, \
|
||||
.wakenet_model_name_2 = NULL, \
|
||||
.wakenet_mode = DET_MODE_90, \
|
||||
.afe_mode = SR_MODE_LOW_COST, \
|
||||
.afe_perferred_core = 0, \
|
||||
.afe_perferred_priority = 5, \
|
||||
.afe_ringbuf_size = 50, \
|
||||
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
|
||||
.afe_linear_gain = 1.0, \
|
||||
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
|
||||
.pcm_config = { \
|
||||
.total_ch_num = 2, \
|
||||
.mic_num = 1, \
|
||||
.ref_num = 1, \
|
||||
.sample_rate = 16000, \
|
||||
}, \
|
||||
.debug_init = false, \
|
||||
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#elif CONFIG_IDF_TARGET_ESP32S3
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
.aec_init = true, \
|
||||
.se_init = true, \
|
||||
.vad_init = true, \
|
||||
.wakenet_init = true, \
|
||||
.voice_communication_init = false, \
|
||||
.voice_communication_agc_init = false, \
|
||||
.voice_communication_agc_gain = 15, \
|
||||
.vad_mode = VAD_MODE_3, \
|
||||
.wakenet_model_name = NULL, \
|
||||
.wakenet_model_name_2 = NULL, \
|
||||
.wakenet_mode = DET_MODE_2CH_90, \
|
||||
.afe_mode = SR_MODE_LOW_COST, \
|
||||
.afe_perferred_core = 0, \
|
||||
.afe_perferred_priority = 5, \
|
||||
.afe_ringbuf_size = 50, \
|
||||
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
|
||||
.afe_linear_gain = 1.0, \
|
||||
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
|
||||
.pcm_config = { \
|
||||
.total_ch_num = 3, \
|
||||
.mic_num = 2, \
|
||||
.ref_num = 1, \
|
||||
.sample_rate = 16000, \
|
||||
}, \
|
||||
.debug_init = false, \
|
||||
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#endif
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char* input_format, afe_pcm_config_t* pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t* mic_data, int16_t* ref_data, afe_pcm_config_t* pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t* out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t* afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
* @param out_data The output audio data
|
||||
* @param out_frame_size Onput data frame size of input
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t * out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t* afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@ -1,7 +1,10 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "stdbool.h"
|
||||
#include "esp_afe_config.h"
|
||||
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -13,13 +16,15 @@ extern "C" {
|
||||
//Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum
|
||||
{
|
||||
AFE_VAD_SILENCE = 0, // noise or silence
|
||||
AFE_VAD_SPEECH // speech
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
@ -27,7 +32,7 @@ typedef enum
|
||||
*/
|
||||
typedef struct afe_fetch_result_t
|
||||
{
|
||||
int16_t *data; // the data of audio.
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
@ -36,10 +41,12 @@ typedef struct afe_fetch_result_t
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
|
||||
afe_vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
void* reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
@ -63,19 +70,11 @@ typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_confi
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the total channel number which be config
|
||||
* @brief Get the channel number
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the mic channel number which be config
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of mic channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
@ -104,12 +103,24 @@ typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t*
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
*
|
||||
@ -129,52 +140,37 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char* model_name);
|
||||
|
||||
/**
|
||||
* @brief Disable wakenet model.
|
||||
* @brief Enable VAD algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_wakenet_t)(esp_afe_sr_data_t *afe);
|
||||
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable wakenet model.
|
||||
* @brief Disable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_wakenet_t)(esp_afe_sr_data_t *afe);
|
||||
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable AEC algorithm.
|
||||
* @brief Enable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_aec_t)(esp_afe_sr_data_t *afe);
|
||||
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable AEC algorithm.
|
||||
* @brief Print all functions/modules/algorithms pipeline.
|
||||
* The pipeline is the order of the functions/modules/algorithms.
|
||||
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_aec_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable SE algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_se_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable SE algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_se_t)(esp_afe_sr_data_t *afe);
|
||||
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy a AFE_SR instance
|
||||
@ -191,22 +187,41 @@ typedef struct {
|
||||
esp_afe_sr_iface_op_create_from_config_t create_from_config;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_total_channel_num_t get_total_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_t set_wakenet;
|
||||
esp_afe_sr_iface_op_disable_wakenet_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_wakenet_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_aec_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_aec_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_se_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_se_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_vad;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_vad;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_ns;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_ns;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_agc;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_agc;
|
||||
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct
|
||||
{
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
}afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -6,17 +6,7 @@ extern "C" {
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
|
||||
#if CONFIG_AFE_INTERFACE_V1
|
||||
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
|
||||
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
|
||||
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
|
||||
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
|
||||
|
||||
#else
|
||||
#error No valid afe selected.
|
||||
#endif
|
||||
|
||||
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
@ -26,8 +26,15 @@ typedef enum {
|
||||
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
|
||||
} ESP_AGE_ERR;
|
||||
|
||||
typedef enum {
|
||||
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
|
||||
AGC_MODE_0 = 0, // Only saturation protection
|
||||
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
|
||||
} agc_mode_t;
|
||||
|
||||
void *esp_agc_open(int agc_mode, int sample_rate);
|
||||
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
|
||||
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
|
||||
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
|
||||
void esp_agc_close(void *agc_handle);
|
||||
|
||||
@ -78,6 +78,8 @@ vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
int sample_rate;
|
||||
int frame_size;
|
||||
}vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t* vad_handle_t;
|
||||
@ -100,31 +102,41 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
* @param sample_rate Sample rate in Hz
|
||||
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @param min_speech_ms Minimum speech duration, unit is ms
|
||||
* @param min_noise_ms Minimum noise duration, unit is ms
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
|
||||
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param inst The instance of VAD.
|
||||
*
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
|
||||
*
|
||||
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
|
||||
*
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
|
||||
|
||||
/**
|
||||
* @brief Free the VAD instance
|
||||
|
||||
164
include/esp32/esp_vadn_iface.h
Normal file
164
include/esp32/esp_vadn_iface.h
Normal file
@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialze a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually abjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_treshold The threshold to trigger wake words, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The index of wake words, return 0 if no wake word is detected, else
|
||||
* the index of the wake words.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The index of wake words, return 0 if no wake word is detected, else
|
||||
* the index of the wake words.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32/esp_vadn_models.h
Normal file
22
include/esp32/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model name is used to filter all wakenet from availabel models.
|
||||
#define ESP_VADN_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the wakenet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of wakenet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
90
include/esp32/esp_webrtc.h
Normal file
90
include/esp32/esp_webrtc.h
Normal file
@ -0,0 +1,90 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_WEBRTC_H_
|
||||
#define _ESP_WEBRTC_H_
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include <stdint.h>
|
||||
#include "sr_ringbuf.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_agc.h"
|
||||
#include "esp_ns.h"
|
||||
|
||||
#include "esp_heap_caps.h"
|
||||
|
||||
typedef struct {
|
||||
void* ns_handle;
|
||||
void* agc_handle;
|
||||
int frame_size;
|
||||
int sample_rate;
|
||||
int16_t *buff;
|
||||
int16_t *out_data;
|
||||
sr_ringbuf_handle_t rb;
|
||||
}webrtc_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of webrtc.
|
||||
*
|
||||
* @warning frame_length can supports be 10 ms, 20 ms, 30 ms, 32 ms.
|
||||
*
|
||||
* @param frame_length_ms The length of the audio processing
|
||||
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param agc_mode The model of AGC
|
||||
* @param agc_gain The gain of AGC. default is 9
|
||||
* @param agc_target_level The target level of AGC. default is -3 dbfs
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of webrtc
|
||||
*/
|
||||
webrtc_handle_t* webrtc_create(
|
||||
int frame_length_ms,
|
||||
int ns_mode,
|
||||
agc_mode_t agc_mode,
|
||||
int agc_gain,
|
||||
int agc_target_level,
|
||||
int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param handle The instance of NS.
|
||||
* @param in_data An array of 16-bit signed audio samples.
|
||||
* @param out_size The sample size of output data
|
||||
* @param enable_ns Enable noise suppression
|
||||
* @param enable_agc Enable automatic gain control
|
||||
*
|
||||
* @return data after noise suppression
|
||||
*/
|
||||
int16_t* webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
|
||||
|
||||
/**
|
||||
* @brief Free the webrtc instance
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void webrtc_destroy(webrtc_handle_t *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_NS_H_
|
||||
@ -1,5 +1,6 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
@ -167,6 +168,25 @@ typedef void (*esp_wn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
*/
|
||||
typedef void (*esp_wn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The index of wake words, return 0 if no wake word is detected, else
|
||||
* the index of the wake words.
|
||||
*/
|
||||
typedef wakenet_state_t (*esp_wn_iface_op_detect_mfcc_t)(model_iface_data_t *model, int16_t *samples, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_wn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a wake word detection model.
|
||||
@ -184,6 +204,8 @@ typedef struct {
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
esp_wn_iface_op_detect_t detect;
|
||||
esp_wn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_wn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_wn_iface_op_clean_t clean;
|
||||
esp_wn_iface_op_destroy_t destroy;
|
||||
} esp_wn_iface_t;
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
15
test_apps/esp-sr/sdkconfig.ci.mn2_cn
Normal file
15
test_apps/esp-sr/sdkconfig.ci.mn2_cn
Normal file
@ -0,0 +1,15 @@
|
||||
# This file was generated using idf.py save-defconfig. It can be edited manually.
|
||||
# Espressif IoT Development Framework (ESP-IDF) 5.3.0 Project Minimal Configuration
|
||||
#
|
||||
CONFIG_IDF_TARGET="esp32"
|
||||
CONFIG_APP_RETRIEVE_LEN_ELF_SHA=16
|
||||
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
|
||||
CONFIG_ESPTOOLPY_FLASHFREQ_80M=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_8MB=y
|
||||
CONFIG_PARTITION_TABLE_CUSTOM=y
|
||||
CONFIG_PARTITION_TABLE_CUSTOM_FILENAME="partitions_esp32.csv"
|
||||
CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION=y
|
||||
CONFIG_ESP_WIFI_GMAC_SUPPORT=n
|
||||
CONFIG_LWIP_TCP_SND_BUF_DEFAULT=5744
|
||||
CONFIG_LWIP_TCP_WND_DEFAULT=5744
|
||||
CONFIG_UNITY_CRITICAL_LEAK_LEVEL_GENERAL=1024
|
||||
@ -1,23 +1,6 @@
|
||||
# This file was generated using idf.py save-defconfig. It can be edited manually.
|
||||
# Espressif IoT Development Framework (ESP-IDF) 5.3.1 Project Minimal Configuration
|
||||
#
|
||||
CONFIG_IDF_TARGET="esp32p4"
|
||||
CONFIG_ESPTOOLPY_FLASHMODE_QIO=y
|
||||
CONFIG_ESPTOOLPY_FLASHSIZE_16MB=y
|
||||
|
||||
CONFIG_PARTITION_TABLE_CUSTOM=y
|
||||
CONFIG_SR_NSN_NSNET2=y
|
||||
CONFIG_SR_VADN_VADNET1_MEDIUM=y
|
||||
CONFIG_SR_WN_WN9_HIESP=y
|
||||
CONFIG_SR_MN_EN_MULTINET7_QUANT=y
|
||||
CONFIG_COMPILER_OPTIMIZATION_PERF=y
|
||||
CONFIG_ESP32P4_REV_MIN_0=y
|
||||
CONFIG_SPIRAM=y
|
||||
CONFIG_SPIRAM_SPEED_200M=y
|
||||
CONFIG_CACHE_L2_CACHE_256KB=y
|
||||
CONFIG_CACHE_L2_CACHE_LINE_128B=y
|
||||
CONFIG_ESP_SYSTEM_ALLOW_RTC_FAST_MEM_AS_HEAP=n
|
||||
CONFIG_ESP_INT_WDT=n
|
||||
CONFIG_ESP_TASK_WDT_EN=n
|
||||
CONFIG_FREERTOS_HZ=1000
|
||||
CONFIG_MBEDTLS_CMAC_C=y
|
||||
CONFIG_IDF_EXPERIMENTAL_FEATURES=y
|
||||
CONFIG_SR_VADN_VADNET1_MEDIUM=y
|
||||
Loading…
Reference in New Issue
Block a user