mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
feat: update esp32c3,esp32c5,esp32c6,esp32s2 lib to support afe
This commit is contained in:
parent
127e75d617
commit
8636daabf6
@ -1,4 +1,4 @@
|
||||
if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32"))
|
||||
if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (${IDF_TARGET} STREQUAL "esp32") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c5") OR (${IDF_TARGET} STREQUAL "esp32c6") OR (${IDF_TARGET} STREQUAL "esp32s2"))
|
||||
set(include_dirs
|
||||
"esp-tts/esp_tts_chinese/include"
|
||||
"include/${IDF_TARGET}"
|
||||
@ -46,9 +46,9 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
|
||||
idf_component_get_property(dl_fft_lib espressif__dl_fft COMPONENT_LIB)
|
||||
|
||||
set(sr_libs
|
||||
dl_lib
|
||||
$<TARGET_FILE:${esp_dsp_lib}>
|
||||
$<TARGET_FILE:${dl_fft_lib}>
|
||||
dl_lib
|
||||
c_speech_features
|
||||
esp_audio_front_end
|
||||
esp_audio_processor
|
||||
@ -72,48 +72,6 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
|
||||
"-Wl,--end-group")
|
||||
|
||||
|
||||
elseif((${IDF_TARGET} STREQUAL "esp32c5") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6") OR (${IDF_TARGET} STREQUAL "esp32s2"))
|
||||
set(srcs
|
||||
"src/model_path.c"
|
||||
)
|
||||
|
||||
set(include_dirs
|
||||
"include/${IDF_TARGET}"
|
||||
"src/include"
|
||||
"esp-tts/esp_tts_chinese/include"
|
||||
)
|
||||
|
||||
set(requires
|
||||
json
|
||||
spiffs
|
||||
esp_partition
|
||||
)
|
||||
|
||||
idf_component_register(SRCS ${srcs}
|
||||
INCLUDE_DIRS ${include_dirs}
|
||||
REQUIRES ${requires}
|
||||
PRIV_REQUIRES spi_flash
|
||||
)
|
||||
|
||||
component_compile_options(-ffast-math -O3 -Wno-error=format=-Wno-format)
|
||||
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(dl_lib "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libdl_lib.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(c_speech_features "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libc_speech_features.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(esp_tts_chinese "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libesp_tts_chinese.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
add_prebuilt_library(voice_set_xiaole "${CMAKE_CURRENT_SOURCE_DIR}/esp-tts/esp_tts_chinese/${IDF_TARGET}/libvoice_set_xiaole.a" PRIV_REQUIRES ${COMPONENT_NAME})
|
||||
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_front_end)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE dl_lib)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE c_speech_features)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE hufzip)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE wakenet)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE esp_tts_chinese)
|
||||
target_link_libraries(${COMPONENT_LIB} PRIVATE voice_set_xiaole)
|
||||
|
||||
endif()
|
||||
|
||||
# Add model partition and flash srmodels.bin
|
||||
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
@ -2,9 +2,8 @@
|
||||
#ifndef _ESP_AFE_AEC_H_
|
||||
#define _ESP_AFE_AEC_H_
|
||||
|
||||
|
||||
#include "esp_afe_config.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_afe_config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -13,19 +12,19 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
aec_handle_t* handle;
|
||||
aec_handle_t *handle;
|
||||
aec_mode_t mode;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int frame_size;
|
||||
int16_t *data;
|
||||
}afe_aec_handle_t;
|
||||
|
||||
int16_t *data;
|
||||
} afe_aec_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently supports only 1 microphone channel and 1 playback channel.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently supports only 1 microphone channel and 1 playback channel.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback
|
||||
* channel will be selected.
|
||||
*
|
||||
* The input format, same as afe config:
|
||||
* M to represent the microphone channel
|
||||
@ -37,7 +36,8 @@ typedef struct {
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
|
||||
* esp32c5.
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
@ -45,17 +45,17 @@ typedef struct {
|
||||
*/
|
||||
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Performs echo cancellation on a frame, based on the audio sent to the speaker and the frame from the mic.
|
||||
*
|
||||
*
|
||||
* @param inst The instance of AEC.
|
||||
* @param indata Input audio data, format is define by input_format. Note indata will be modified in function call.
|
||||
* @param outdata Returns near-end signal with echo removed.
|
||||
* @param indata Input audio data, format is defined by input_format.
|
||||
* @param outdata Near-end signal with echo removed. outdata must be 16-byte aligned.
|
||||
* please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory
|
||||
|
||||
* @return The bytes of outdata.
|
||||
*/
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Get frame size of AEC (the samples of one frame)
|
||||
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
|
||||
*/
|
||||
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Free the AEC instance
|
||||
*
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
#pragma once
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -27,7 +33,8 @@ typedef enum {
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
|
||||
AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
@ -62,8 +69,220 @@ typedef enum {
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
|
||||
// If you find vad cache can not cover all speech, please increase this value.
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
* @param out_data The output audio data
|
||||
* @param out_frame_size Frame size of the output data
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
48
include/esp32c3/esp_afe_doa.h
Normal file
48
include/esp32c3/esp_afe_doa.h
Normal file
@ -0,0 +1,48 @@
|
||||
#ifndef _ESP_AFE_DOA_H_
|
||||
#define _ESP_AFE_DOA_H_
|
||||
|
||||
#include "esp_doa.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
doa_handle_t *doa_handle;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int16_t *leftdata;
|
||||
int16_t *rightdata;
|
||||
int frame_size;
|
||||
} afe_doa_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param input_format The input format
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized afe_doa_handle_t object pointer. Using the example configuration above is recommended for better performance
|
||||
*/
|
||||
afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param handle afe_doa_handle_t instance pointer
|
||||
* @param indata Input audio data, format is defined by input_format.
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param handle afe_doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void afe_doa_destroy(afe_doa_handle_t *handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_AFE_DOA_H_ */
|
||||
237
include/esp32c3/esp_afe_sr_iface.h
Normal file
237
include/esp32c3/esp_afe_sr_iface.h
Normal file
@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
|
||||
// start from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
*
|
||||
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the AFE_SR
|
||||
*
|
||||
* @Warning The input data should be arranged in the format of channel interleaving.
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Set wakenet detection threshold
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @param threshold The wakenet detection threshold, the value is between 0.4 and 0.9999.
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
|
||||
|
||||
/**
|
||||
* @brief Reset wakenet detection threshold to initial state
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
|
||||
|
||||
/**
|
||||
* @brief Reset one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Print all functions/modules/algorithms pipeline.
|
||||
* The pipeline is the order of the functions/modules/algorithms.
|
||||
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy a AFE_SR instance
|
||||
*
|
||||
* @param afe AFE_SR object to destroy
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_afe_sr_iface_op_create_from_config_t create_from_config;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_vad;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_vad;
|
||||
esp_afe_sr_iface_op_reset_op_t reset_vad;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_ns;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_ns;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_agc;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_agc;
|
||||
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
13
include/esp32c3/esp_afe_sr_models.h
Normal file
13
include/esp32c3/esp_afe_sr_models.h
Normal file
@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
47
include/esp32c3/esp_agc.h
Normal file
47
include/esp32c3/esp_agc.h
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_AGC_H_
|
||||
#define _ESP_AGC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
////all positive values are valid, negative is error
|
||||
typedef enum {
|
||||
ESP_AGC_SUCCESS = 0, ////success
|
||||
ESP_AGC_FAIL = -1, ////agc fail
|
||||
ESP_AGC_SAMPLE_RATE_ERROR = -2, ///sample rate can be only 8khz, 16khz, 32khz
|
||||
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
|
||||
} ESP_AGE_ERR;
|
||||
|
||||
typedef enum {
|
||||
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
|
||||
AGC_MODE_0 = 0, // Only saturation protection
|
||||
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
|
||||
} agc_mode_t;
|
||||
|
||||
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
|
||||
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
|
||||
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
|
||||
void esp_agc_close(void *agc_handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _ESP_AGC_H_
|
||||
41
include/esp32c3/esp_doa.h
Normal file
41
include/esp32c3/esp_doa.h
Normal file
@ -0,0 +1,41 @@
|
||||
#ifndef _ESP_DOA_H_
|
||||
#define _ESP_DOA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct doa_handle_t doa_handle_t;
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
|
||||
*/
|
||||
doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param doa doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void esp_doa_destroy(doa_handle_t *doa);
|
||||
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param doa doa_handle_t instance pointer
|
||||
* @param left Left channel 16-bit PCM data
|
||||
* @param right Right channel 16-bit PCM data
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_DOA_H_ */
|
||||
93
include/esp32c3/esp_mase.h
Normal file
93
include/esp32c3/esp_mase.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_MASE_H_
|
||||
#define _ESP_MASE_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MASE_SAMPLE_RATE 16000 // Supports 16kHz only
|
||||
#define MASE_FRAME_SIZE 16 // Supports 16ms only
|
||||
#define MASE_MIC_DISTANCE 65 // According to physical design of mic-array
|
||||
|
||||
/**
|
||||
* @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array
|
||||
* are supported.
|
||||
*/
|
||||
typedef enum {
|
||||
TWO_MIC_LINE = 0,
|
||||
THREE_MIC_CIRCLE = 1
|
||||
} mase_mic_array_type_t;
|
||||
|
||||
/**
|
||||
* @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
|
||||
*/
|
||||
typedef enum {
|
||||
NORMAL_ENHANCEMENT_MODE = 0,
|
||||
WAKE_UP_ENHANCEMENT_MODE = 1
|
||||
} mase_op_mode_t;
|
||||
|
||||
typedef void* mase_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the MASE structure.
|
||||
*
|
||||
* @param fs The sampling frequency (Hz), must be 16000.
|
||||
*
|
||||
* @param frame_size The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param array_type '0' for 2-mic line array and '1' for 3-mic circular array.
|
||||
*
|
||||
* @param mic_distance The distance between neighboring microphones in mm.
|
||||
*
|
||||
* @param operating_mode '0' for normal mode and '1' for wake-up enhanced mode.
|
||||
*
|
||||
* @param filter_strength Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: An instance of MASE
|
||||
*/
|
||||
mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
|
||||
|
||||
/**
|
||||
* @brief Performs mic array processing for one frame.
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @param in An array of 16-bit signed audio samples from mic.
|
||||
*
|
||||
* @param dsp_out Returns enhanced signal.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
|
||||
|
||||
/**
|
||||
* @brief Free the MASE instance
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_destory(mase_handle_t st);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
223
include/esp32c3/esp_mn_iface.h
Normal file
223
include/esp32c3/esp_mn_iface.h
Normal file
@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ESP_MN_RESULT_MAX_NUM 5
|
||||
#define ESP_MN_MAX_PHRASE_NUM 400
|
||||
#define ESP_MN_MAX_PHRASE_LEN 63
|
||||
#define ESP_MN_MIN_PHRASE_LEN 2
|
||||
|
||||
#define ESP_MN_PREFIX "mn"
|
||||
#define ESP_MN_ENGLISH "en"
|
||||
#define ESP_MN_CHINESE "cn"
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_STATE_DETECTING = 0, // detecting
|
||||
ESP_MN_STATE_DETECTED = 1, // detected
|
||||
ESP_MN_STATE_TIMEOUT = 2, // time out
|
||||
} esp_mn_state_t;
|
||||
|
||||
//Set multinet loading mode
|
||||
//The memory consumption is decreased with increasing mode,
|
||||
//As a consequence also the CPU loading rate goes up
|
||||
typedef enum {
|
||||
ESP_MN_LOAD_FROM_PSRAM = 0, // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
|
||||
ESP_MN_LOAD_FROM_PSRAM_FLASH = 1, // Load some weights from PSRAM and load the rest from FLASH (default)
|
||||
ESP_MN_LOAD_FROM_FLASH = 2, // Load more weights from FLASH. Minimum memory consumption with slowest computation
|
||||
} esp_mn_loader_mode_t;
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_GREEDY_SEARCH = 0, // greedy search
|
||||
ESP_MN_BEAM_SEARCH = 1, // beam search
|
||||
ESP_MN_BEAM_SEARCH_WITH_FST = 2, // beam search with trie language model
|
||||
} esp_mn_search_method_t;
|
||||
|
||||
typedef enum {
|
||||
CHINESE_ID = 1, // Chinese language
|
||||
ENGLISH_ID = 2, // English language
|
||||
} language_id_t;
|
||||
|
||||
// Return all possible recognition results
|
||||
typedef struct{
|
||||
esp_mn_state_t state;
|
||||
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
|
||||
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id.
|
||||
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
|
||||
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
|
||||
char string[256];
|
||||
} esp_mn_results_t;
|
||||
|
||||
typedef struct {
|
||||
char *string; // command string
|
||||
char *phonemes; // command phonemes, if applicable
|
||||
int16_t command_id; // the command id
|
||||
float threshold; // trigger threshold, default: 0
|
||||
int16_t *wave; // prompt wave data of the phrase
|
||||
} esp_mn_phrase_t;
|
||||
|
||||
typedef struct _mn_node_ {
|
||||
esp_mn_phrase_t *phrase;
|
||||
struct _mn_node_ *next;
|
||||
} esp_mn_node_t;
|
||||
|
||||
typedef struct{
|
||||
int16_t num; // The number of error phrases, which can not be added into the model
|
||||
esp_mn_phrase_t **phrases; // The array of error phrase pointer
|
||||
} esp_mn_error_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize a model instance with specified model name.
|
||||
*
|
||||
* @param model_name The multinet model name.
|
||||
* @param duration The duration (ms) to trigger the timeout
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
|
||||
|
||||
/**
|
||||
* @brief Switch multinet mode to change memory consumption and CPU loading
|
||||
*
|
||||
* @warning Just Support multinet6 or later versions
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param mode The multinet loader mode
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the number of frames recognized by the command word
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of the frames recognized by the command word
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the language of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return Language name string defined in esp_mn_iface.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
|
||||
*/
|
||||
typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
||||
*
|
||||
* @param model The model object to query.
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @return The state of multinet
|
||||
*/
|
||||
typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Destroy a speech commands recognition model
|
||||
*
|
||||
* @param model The Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get recognition results
|
||||
*
|
||||
* @param model The Model object to query
|
||||
*
|
||||
* @return The current results.
|
||||
*/
|
||||
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Open the log print
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Clean all status of model
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Set the speech commands by mn_command_root
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
* @param mn_command_root The speech commands link.
|
||||
* @return The error phrase id info.
|
||||
*/
|
||||
typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Check if input string can be tokenized
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
* @param str The input string
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
|
||||
|
||||
typedef struct {
|
||||
esp_mn_iface_op_create_t create;
|
||||
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
||||
esp_mn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_mn_iface_op_get_language_t get_language;
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
esp_mn_iface_op_get_results_t get_results;
|
||||
esp_mn_iface_op_open_log_t open_log;
|
||||
esp_mn_iface_op_clean_t clean;
|
||||
esp_wn_iface_op_set_speech_commands set_speech_commands;
|
||||
esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
|
||||
esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
|
||||
esp_mn_iface_op_check_speech_command check_speech_command;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
66
include/esp32c3/esp_mn_models.h
Normal file
66
include/esp32c3/esp_mn_models.h
Normal file
@ -0,0 +1,66 @@
|
||||
#pragma once
|
||||
#include "esp_mn_iface.h"
|
||||
|
||||
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
|
||||
//a specific phrase or word.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/**
|
||||
* @brief Get the multinet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of multinet
|
||||
*/
|
||||
esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the multinet language from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The language of multinet
|
||||
*/
|
||||
char *esp_mn_language_from_name(char *model_name);
|
||||
|
||||
/*
|
||||
Configure wake word to use based on what's selected in menuconfig.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
|
||||
#include "multinet2_ch.h"
|
||||
#define MULTINET_COEFF get_coeff_multinet2_ch
|
||||
#define MULTINET_MODEL_NAME "mn2_cn"
|
||||
|
||||
#else
|
||||
#define MULTINET_COEFF "COEFF_NULL"
|
||||
#define MULTINET_MODEL_NAME "NULL"
|
||||
#endif
|
||||
|
||||
|
||||
/* example
|
||||
|
||||
static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
|
||||
|
||||
//Initialize MultiNet model data
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
|
||||
add_speech_commands(multinet, model_data);
|
||||
|
||||
//Set parameters of buffer
|
||||
int audio_chunksize=model->get_samp_chunksize(model_data);
|
||||
int frequency = model->get_samp_rate(model_data);
|
||||
int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
|
||||
|
||||
//Detect
|
||||
int r=model->detect(model_data, buffer);
|
||||
if (r>0) {
|
||||
printf("Detection triggered output %d.\n", r);
|
||||
}
|
||||
|
||||
//Destroy model
|
||||
model->destroy(model_data)
|
||||
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
86
include/esp32c3/esp_ns.h
Normal file
86
include/esp32c3/esp_ns.h
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_NS_H_
|
||||
#define _ESP_NS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NS_USE_SPIARM 0
|
||||
#define NS_FRAME_LENGTH_MS 10 //Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* The Sampling frequency (Hz) must be 16000Hz
|
||||
*/
|
||||
|
||||
typedef void* ns_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the NS structure.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can be 10ms, 20ms, 30ms.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_create(int frame_length);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of the more powerful noise suppression algorithm.
|
||||
*
|
||||
* @warning frame_length only supports 10 ms.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can only be 10ms.
|
||||
* @param mode 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
*
|
||||
* @param outdata An array of 16-bit signed audio samples after noise suppression.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Free the NS instance
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_destroy(ns_handle_t inst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_NS_H_
|
||||
64
include/esp32c3/esp_nsn_iface.h
Normal file
64
include/esp32c3/esp_nsn_iface.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
|
||||
//Opaque model data container
|
||||
typedef struct esp_nsn_data_t esp_nsn_data_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance
|
||||
*
|
||||
* @param model_name The name of the model instance
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the process function
|
||||
*
|
||||
* Every noise suppression model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the process function
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the noise suppression model and get data after process.
|
||||
*
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @param out_data An array of 16-bit signed audio samples after process.
|
||||
* @return The state of return.
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the process function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a noise suppression model
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a noise suppression model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_nsn_iface_op_create_t create;
|
||||
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_nsn_iface_op_process_t process;
|
||||
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_nsn_iface_op_destroy_t destroy;
|
||||
} esp_nsn_iface_t;
|
||||
17
include/esp32c3/esp_nsn_models.h
Normal file
17
include/esp32c3/esp_nsn_models.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "esp_nsn_iface.h"
|
||||
|
||||
/*
|
||||
The prefix of nsnet
|
||||
Now there are nsnet1 and nsnet2
|
||||
*/
|
||||
#define ESP_NSNET_PREFIX "nsnet"
|
||||
|
||||
/**
|
||||
* @brief Get the nsnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of nsnet
|
||||
*/
|
||||
esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
|
||||
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
|
||||
|
||||
float *esp_fftr(float *x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
|
||||
84
include/esp32c3/esp_sr_webrtc.h
Normal file
84
include/esp32c3/esp_sr_webrtc.h
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_WEBRTC_H_
|
||||
#define _ESP_WEBRTC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "esp_agc.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_ns.h"
|
||||
#include "sr_ringbuf.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include "esp_heap_caps.h"
|
||||
|
||||
typedef struct {
|
||||
void *ns_handle;
|
||||
void *agc_handle;
|
||||
int frame_size;
|
||||
int sample_rate;
|
||||
int16_t *buff;
|
||||
int16_t *out_data;
|
||||
sr_ringbuf_handle_t rb;
|
||||
} webrtc_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of webrtc.
|
||||
*
|
||||
* @warning frame_length_ms supports 10 ms, 20 ms, 30 ms, 32 ms.
|
||||
*
|
||||
* @param frame_length_ms The length of the audio processing
|
||||
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param agc_mode The model of AGC
|
||||
* @param agc_gain The gain of AGC. default is 9
|
||||
* @param agc_target_level The target level of AGC. default is -3 dbfs
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of webrtc
|
||||
*/
|
||||
webrtc_handle_t *webrtc_create(
|
||||
int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
* @param size The sample size of output data
|
||||
* @param enable_ns Enable noise suppression
|
||||
* @param enable_agc Enable automatic gain control
|
||||
*
|
||||
* @return data after noise suppression
|
||||
*/
|
||||
int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
|
||||
|
||||
/**
|
||||
* @brief Free the webrtc instance
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void webrtc_destroy(webrtc_handle_t *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_WEBRTC_H_
|
||||
178
include/esp32c3/esp_vad.h
Normal file
178
include/esp32c3/esp_vad.h
Normal file
@ -0,0 +1,178 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_VAD_H_
|
||||
#define _ESP_VAD_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
|
||||
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
|
||||
* restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
|
||||
*/
|
||||
typedef enum {
|
||||
VAD_MODE_0 = 0, // Normal
|
||||
VAD_MODE_1, // Aggressive
|
||||
VAD_MODE_2, // Very Aggressive
|
||||
VAD_MODE_3, // Very Very Aggressive
|
||||
VAD_MODE_4 // Very Very Very Aggressive
|
||||
} vad_mode_t;
|
||||
|
||||
typedef enum {
|
||||
VAD_SILENCE = 0,
|
||||
VAD_SPEECH = 1,
|
||||
} vad_state_t;
|
||||
|
||||
typedef struct vad_trigger_tag {
|
||||
vad_state_t state;
|
||||
unsigned int min_speech_len;
|
||||
unsigned int noise_len;
|
||||
unsigned int min_noise_len;
|
||||
unsigned int speech_len;
|
||||
} vad_trigger_t;
|
||||
|
||||
#define vad_MAX_LEN INT32_MAX - 1
|
||||
/**
|
||||
* @brief Allocate VAD trigger
|
||||
*
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
*
|
||||
* @return Trigger pointer
|
||||
**/
|
||||
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Free VAD trigger
|
||||
**/
|
||||
void vad_trigger_free(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Reset VAD trigger
|
||||
**/
|
||||
void vad_trigger_reset(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Detect voice activity by trigger
|
||||
**/
|
||||
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
int sample_rate;
|
||||
int frame_size;
|
||||
} vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t *vad_handle_t;
|
||||
|
||||
// typedef vad_handle_tag * vad_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param sample_rate Sample rate in Hz
|
||||
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @param min_speech_ms Minimum speech duration, unit is ms
|
||||
* @param min_noise_ms Minimum noise duration, unit is ms
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(
|
||||
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
|
||||
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
|
||||
|
||||
/**
|
||||
* @brief Reset trigger state as Silence
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
*/
|
||||
void vad_reset_trigger(vad_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Free the VAD instance
|
||||
*
|
||||
* @param inst The instance of VAD.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void vad_destroy(vad_handle_t inst);
|
||||
|
||||
/*
|
||||
* Programming Guide:
|
||||
*
|
||||
* @code{c}
|
||||
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to
|
||||
* the VAD structure.
|
||||
*
|
||||
* while (1) {
|
||||
* //Use buffer to receive the audio data from MIC.
|
||||
* vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Feed samples to the VAD process and get the result.
|
||||
* }
|
||||
*
|
||||
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
|
||||
*
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_VAD_H_
|
||||
164
include/esp32c3/esp_vadn_iface.h
Normal file
164
include/esp32c3/esp_vadn_iface.h
Normal file
@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of input audio channels expected by the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger wake words, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The VAD state of the fed samples as a vad_state_t value,
|
||||
* e.g. VAD_SILENCE if no voice, VAD_SPEECH if voice is detected.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The VAD state of the fed MFCC data as a vad_state_t value,
|
||||
* e.g. VAD_SILENCE if no voice, VAD_SPEECH if voice is detected.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32c3/esp_vadn_models.h
Normal file
22
include/esp32c3/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model name is used to filter all vadnet models from the available models.
|
||||
#define ESP_VADN_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the vadnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of vadnet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -29,6 +29,7 @@ typedef enum {
|
||||
DET_MODE_2CH_95 = 3,
|
||||
DET_MODE_3CH_90 = 4,
|
||||
DET_MODE_3CH_95 = 5,
|
||||
DET_MODE_90_COPY_PARAMS = 6, // Aggressive
|
||||
} det_mode_t;
|
||||
|
||||
typedef struct {
|
||||
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
|
||||
* @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
|
||||
* @param word_index The index of wake word
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the threshold to its initial state
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the wake word detection threshold of different modes
|
||||
*
|
||||
@ -200,6 +210,7 @@ typedef struct {
|
||||
esp_wn_iface_op_get_word_num_t get_word_num;
|
||||
esp_wn_iface_op_get_word_name_t get_word_name;
|
||||
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
|
||||
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
|
||||
@ -11,7 +11,7 @@ extern "C" {
|
||||
/**
|
||||
* @brief Get the wakenet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of wakenet
|
||||
*/
|
||||
const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
/**
|
||||
* @brief Get the wake word name from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
|
||||
*/
|
||||
char* esp_wn_wakeword_from_name(const char *model_name);
|
||||
char *esp_wn_wakeword_from_name(const char *model_name);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
20
include/esp32c3/flite_g2p.h
Normal file
20
include/esp32c3/flite_g2p.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef __FLITE_G2P_H__
|
||||
#define __FLITE_G2P_H__
|
||||
|
||||
typedef struct {
|
||||
int num_phonemes;
|
||||
int phoneme_size;
|
||||
char **phonemes;
|
||||
} flite_g2p_result;
|
||||
|
||||
void flite_g2p_result_free(flite_g2p_result *result);
|
||||
|
||||
flite_g2p_result *flite_g2p_get_result(const char *grapheme);
|
||||
|
||||
void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p(const char *graphemes, int map_phonemes);
|
||||
|
||||
#endif
|
||||
@ -2,9 +2,8 @@
|
||||
#ifndef _ESP_AFE_AEC_H_
|
||||
#define _ESP_AFE_AEC_H_
|
||||
|
||||
|
||||
#include "esp_afe_config.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_afe_config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -13,19 +12,19 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
aec_handle_t* handle;
|
||||
aec_handle_t *handle;
|
||||
aec_mode_t mode;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int frame_size;
|
||||
int16_t *data;
|
||||
}afe_aec_handle_t;
|
||||
|
||||
int16_t *data;
|
||||
} afe_aec_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only support 1 microphone channel and 1 playback channe.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only supports 1 microphone channel and 1 playback channel.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback
|
||||
* channel will be selected.
|
||||
*
|
||||
* The input format, same as afe config:
|
||||
* M to represent the microphone channel
|
||||
@ -37,7 +36,8 @@ typedef struct {
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
|
||||
* esp32c5.
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
@ -45,17 +45,17 @@ typedef struct {
|
||||
*/
|
||||
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
|
||||
*
|
||||
*
|
||||
* @param inst The instance of AEC.
|
||||
* @param indata Input audio data, format is define by input_format. Note indata will be modified in function call.
|
||||
* @param outdata Returns near-end signal with echo removed.
|
||||
* @param indata Input audio data, format is defined by input_format.
|
||||
* @param outdata Near-end signal with echo removed. outdata must be 16-byte aligned.
|
||||
* please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory
|
||||
|
||||
* @return The bytes of outdata.
|
||||
*/
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Get frame size of AEC (the samples of one frame)
|
||||
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
|
||||
*/
|
||||
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Free the AEC instance
|
||||
*
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
#pragma once
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -27,7 +33,8 @@ typedef enum {
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
|
||||
AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
@ -62,8 +69,220 @@ typedef enum {
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
|
||||
// If you find vad cache can not cover all speech, please increase this value.
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
* @param out_data The output audio data
|
||||
* @param out_frame_size Output data frame size
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
48
include/esp32c5/esp_afe_doa.h
Normal file
48
include/esp32c5/esp_afe_doa.h
Normal file
@ -0,0 +1,48 @@
|
||||
#ifndef _ESP_AFE_DOA_H_
|
||||
#define _ESP_AFE_DOA_H_
|
||||
|
||||
#include "esp_doa.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
doa_handle_t *doa_handle;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int16_t *leftdata;
|
||||
int16_t *rightdata;
|
||||
int frame_size;
|
||||
} afe_doa_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param input_format The input format
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized afe_doa_handle_t object pointer. The example values above are recommended for better performance
|
||||
*/
|
||||
afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param handle afe_doa_handle_t instance pointer
|
||||
* @param indata Input audio data, format is defined by input_format.
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param handle afe_doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void afe_doa_destroy(afe_doa_handle_t *handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_AFE_DOA_H_ */
|
||||
237
include/esp32c5/esp_afe_sr_iface.h
Normal file
237
include/esp32c5/esp_afe_sr_iface.h
Normal file
@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model was triggered. Index
|
||||
// start from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is busy.
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the number of samples per channel in each frame that needs to be passed to the function
|
||||
*
|
||||
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the AFE_SR
|
||||
*
|
||||
* @Warning The input data should be arranged in the format of channel interleaving.
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Set wakenet detection threshold
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @param threshold The wakenet detection threshold, the value is between 0.4 and 0.9999.
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
|
||||
|
||||
/**
|
||||
* @brief Reset wakenet detection threshold to initial state
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
|
||||
|
||||
/**
|
||||
* @brief Reset one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Print all functions/modules/algorithms pipeline.
|
||||
* The pipeline is the order of the functions/modules/algorithms.
|
||||
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy a AFE_SR instance
|
||||
*
|
||||
* @param afe AFE_SR object to destroy
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_afe_sr_iface_op_create_from_config_t create_from_config;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_vad;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_vad;
|
||||
esp_afe_sr_iface_op_reset_op_t reset_vad;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_ns;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_ns;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_agc;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_agc;
|
||||
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
// Stores the AFE_SR instance data, its operation table, and the task handles used by the AFE task.
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data; // opaque AFE_SR instance data
|
||||
esp_afe_sr_iface_t *afe_handle; // table of functions used to operate on afe_data
|
||||
TaskHandle_t feed_task; // presumably the task that calls feed -- TODO confirm
|
||||
TaskHandle_t fetch_task; // presumably the task that calls fetch -- TODO confirm
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
13
include/esp32c5/esp_afe_sr_models.h
Normal file
13
include/esp32c5/esp_afe_sr_models.h
Normal file
@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
47
include/esp32c5/esp_agc.h
Normal file
47
include/esp32c5/esp_agc.h
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_AGC_H_
|
||||
#define _ESP_AGC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
////all positive values are valid, negative values are errors
|
||||
typedef enum {
|
||||
ESP_AGC_SUCCESS = 0, ////success
|
||||
ESP_AGC_FAIL = -1, ////agc fail
|
||||
ESP_AGC_SAMPLE_RATE_ERROR = -2, ////sample rate can only be 8khz, 16khz or 32khz
|
||||
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame must be 10ms long, so the frame size in samples is derived from the sample rate
|
||||
} ESP_AGE_ERR;
|
||||
|
||||
typedef enum {
|
||||
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
|
||||
AGC_MODE_0 = 0, // Only saturation protection
|
||||
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
|
||||
} agc_mode_t;
|
||||
|
||||
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
|
||||
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
|
||||
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
|
||||
void esp_agc_close(void *agc_handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _ESP_AGC_H_
|
||||
41
include/esp32c5/esp_doa.h
Normal file
41
include/esp32c5/esp_doa.h
Normal file
@ -0,0 +1,41 @@
|
||||
#ifndef _ESP_DOA_H_
|
||||
#define _ESP_DOA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct doa_handle_t doa_handle_t;
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
|
||||
*/
|
||||
doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param doa doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void esp_doa_destroy(doa_handle_t *doa);
|
||||
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param doa doa_handle_t instance pointer
|
||||
* @param left Left channel 16-bit PCM data
|
||||
* @param right Right channel 16-bit PCM data
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_DOA_H_ */
|
||||
93
include/esp32c5/esp_mase.h
Normal file
93
include/esp32c5/esp_mase.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_MASE_H_
|
||||
#define _ESP_MASE_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MASE_SAMPLE_RATE 16000 // Supports 16kHz only
|
||||
#define MASE_FRAME_SIZE 16 // Supports 16ms only
|
||||
#define MASE_MIC_DISTANCE 65 // According to physical design of mic-array
|
||||
|
||||
/**
|
||||
* @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array
|
||||
* are supported.
|
||||
*/
|
||||
typedef enum {
|
||||
TWO_MIC_LINE = 0,
|
||||
THREE_MIC_CIRCLE = 1
|
||||
} mase_mic_array_type_t;
|
||||
|
||||
/**
|
||||
* @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
|
||||
*/
|
||||
typedef enum {
|
||||
NORMAL_ENHANCEMENT_MODE = 0,
|
||||
WAKE_UP_ENHANCEMENT_MODE = 1
|
||||
} mase_op_mode_t;
|
||||
|
||||
typedef void* mase_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the MASE structure.
|
||||
*
|
||||
* @param fs The sampling frequency (Hz), must be 16000.
|
||||
*
|
||||
* @param frame_size The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param array_type '0' for 2-mic line array and '1' for 3-mic circular array.
|
||||
*
|
||||
* @param mic_distance The distance between neighboring microphones in mm.
|
||||
*
|
||||
* @param operating_mode '0' for normal mode and '1' for wake-up enhanced mode.
|
||||
*
|
||||
* @param filter_strength Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: An instance of MASE
|
||||
*/
|
||||
mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
|
||||
|
||||
/**
|
||||
* @brief Performs mic array processing for one frame.
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @param in An array of 16-bit signed audio samples from mic.
|
||||
*
|
||||
* @param dsp_out Returns enhanced signal.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
|
||||
|
||||
/**
|
||||
* @brief Free the MASE instance
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_destory(mase_handle_t st);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
223
include/esp32c5/esp_mn_iface.h
Normal file
223
include/esp32c5/esp_mn_iface.h
Normal file
@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ESP_MN_RESULT_MAX_NUM 5
|
||||
#define ESP_MN_MAX_PHRASE_NUM 400
|
||||
#define ESP_MN_MAX_PHRASE_LEN 63
|
||||
#define ESP_MN_MIN_PHRASE_LEN 2
|
||||
|
||||
#define ESP_MN_PREFIX "mn"
|
||||
#define ESP_MN_ENGLISH "en"
|
||||
#define ESP_MN_CHINESE "cn"
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_STATE_DETECTING = 0, // detecting
|
||||
ESP_MN_STATE_DETECTED = 1, // detected
|
||||
ESP_MN_STATE_TIMEOUT = 2, // time out
|
||||
} esp_mn_state_t;
|
||||
|
||||
//Set multinet loading mode
|
||||
//Memory consumption decreases as the mode value increases;
|
||||
//as a consequence, the CPU load goes up
|
||||
typedef enum {
|
||||
ESP_MN_LOAD_FROM_PSRAM = 0, // Load all weights from PSRAM. Fastest computation with maximum memory consumption
|
||||
ESP_MN_LOAD_FROM_PSRAM_FLASH = 1, // Load some weights from PSRAM and load the rest from FLASH (default)
|
||||
ESP_MN_LOAD_FROM_FLASH = 2, // Load more weights from FLASH. Minimum memory consumption with slowest computation
|
||||
} esp_mn_loader_mode_t;
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_GREEDY_SEARCH = 0, // greedy search
|
||||
ESP_MN_BEAM_SEARCH = 1, // beam search
|
||||
ESP_MN_BEAM_SEARCH_WITH_FST = 2, // beam search with trie language model
|
||||
} esp_mn_search_method_t;
|
||||
|
||||
typedef enum {
|
||||
CHINESE_ID = 1, // Chinese language
|
||||
ENGLISH_ID = 2, // English language
|
||||
} language_id_t;
|
||||
|
||||
// Holds all possible recognition results
|
||||
typedef struct{
|
||||
esp_mn_state_t state; // The multinet state (detecting, detected or timeout).
|
||||
int num; // The number of phrases in the list, num <= ESP_MN_RESULT_MAX_NUM (5). When num=0, no phrase is recognized.
|
||||
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command ids.
|
||||
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase ids.
|
||||
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probabilities.
|
||||
char string[256]; // NOTE(review): presumably the recognized text -- confirm against the implementation
|
||||
} esp_mn_results_t;
|
||||
|
||||
typedef struct {
|
||||
char *string; // command string
|
||||
char *phonemes; // command phonemes, if applicable
|
||||
int16_t command_id; // the command id
|
||||
float threshold; // trigger threshold, default: 0
|
||||
int16_t *wave; // prompt wave data of the phrase
|
||||
} esp_mn_phrase_t;
|
||||
|
||||
typedef struct _mn_node_ {
|
||||
esp_mn_phrase_t *phrase;
|
||||
struct _mn_node_ *next;
|
||||
} esp_mn_node_t;
|
||||
|
||||
typedef struct{
|
||||
int16_t num; // The number of error phrases, which can not added into model
|
||||
esp_mn_phrase_t **phrases; // The array of error phrase pointer
|
||||
} esp_mn_error_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize a model instance with the specified model name.
|
||||
*
|
||||
* @param model_name The multinet model name.
|
||||
* @param duration The duration (ms) to trigger the timeout
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
|
||||
|
||||
/**
|
||||
* @brief Switch multinet mode to change memory consumption and CPU loading
|
||||
*
|
||||
* @warning Just Support multinet6 or later versions
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param mode The multinet loader mode
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the number of frames recognized by the command word
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of the frames recognized by the command word
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the language of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return Language name string defined in esp_mn_iface.h, e.g. ESP_MN_CHINESE, ESP_MN_ENGLISH
|
||||
*/
|
||||
typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
||||
*
|
||||
* @param model The model object to query.
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @return The state of multinet
|
||||
*/
|
||||
typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Destroy a speech commands recognition model
|
||||
*
|
||||
* @param model The Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get recognition results
|
||||
*
|
||||
* @param model The Model object to query
|
||||
*
|
||||
* @return The current results.
|
||||
*/
|
||||
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Open the log print
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Clean all status of model
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Set the speech commands by mn_command_root
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
* @param mn_command_root The speech commands link.
|
||||
* @return The error phrase id info.
|
||||
*/
|
||||
typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Check if input string can be tokenized
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
* @param str The input string
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
|
||||
|
||||
typedef struct {
|
||||
esp_mn_iface_op_create_t create;
|
||||
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
||||
esp_mn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_mn_iface_op_get_language_t get_language;
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
esp_mn_iface_op_get_results_t get_results;
|
||||
esp_mn_iface_op_open_log_t open_log;
|
||||
esp_mn_iface_op_clean_t clean;
|
||||
esp_wn_iface_op_set_speech_commands set_speech_commands;
|
||||
esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
|
||||
esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
|
||||
esp_mn_iface_op_check_speech_command check_speech_command;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
66
include/esp32c5/esp_mn_models.h
Normal file
66
include/esp32c5/esp_mn_models.h
Normal file
@ -0,0 +1,66 @@
|
||||
#pragma once
|
||||
#include "esp_mn_iface.h"
|
||||
|
||||
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
|
||||
//a specific phrase or word.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/**
|
||||
* @brief Get the multinet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of multinet
|
||||
*/
|
||||
esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the multinet language from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The language of multinet
|
||||
*/
|
||||
char *esp_mn_language_from_name(char *model_name);
|
||||
|
||||
/*
|
||||
Configure the MultiNet model to use based on what's selected in menuconfig.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
|
||||
#include "multinet2_ch.h"
|
||||
#define MULTINET_COEFF get_coeff_multinet2_ch
|
||||
#define MULTINET_MODEL_NAME "mn2_cn"
|
||||
|
||||
#else
|
||||
#define MULTINET_COEFF "COEFF_NULL"
|
||||
#define MULTINET_MODEL_NAME "NULL"
|
||||
#endif
|
||||
|
||||
|
||||
/* example
|
||||
|
||||
static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
|
||||
|
||||
//Initialize MultiNet model data
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
|
||||
add_speech_commands(multinet, model_data);
|
||||
|
||||
//Set parameters of buffer
|
||||
int audio_chunksize=multinet->get_samp_chunksize(model_data);
|
||||
int frequency = multinet->get_samp_rate(model_data);
|
||||
int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
|
||||
|
||||
//Detect
|
||||
int r=multinet->detect(model_data, buffer);
|
||||
if (r>0) {
|
||||
printf("Detection triggered output %d.\n", r);
|
||||
}
|
||||
//Destroy model
|
||||
|
||||
multinet->destroy(model_data);
|
||||
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
86
include/esp32c5/esp_ns.h
Normal file
86
include/esp32c5/esp_ns.h
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_NS_H_
|
||||
#define _ESP_NS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NS_USE_SPIARM 0
|
||||
#define NS_FRAME_LENGTH_MS 10 //Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* The Sampling frequency (Hz) must be 16000Hz
|
||||
*/
|
||||
|
||||
typedef void* ns_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the NS structure.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can be 10ms, 20ms, 30ms.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_create(int frame_length);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of the more powerful noise suppression algorithm.
|
||||
*
|
||||
* @warning frame_length only supports 10 ms.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can only be 10ms.
|
||||
* @param mode 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
*
|
||||
* @param outdata An array of 16-bit signed audio samples after noise suppression.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Free the NS instance
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_destroy(ns_handle_t inst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_NS_H_
|
||||
64
include/esp32c5/esp_nsn_iface.h
Normal file
64
include/esp32c5/esp_nsn_iface.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
|
||||
//Opaque model data container
|
||||
typedef struct esp_nsn_data_t esp_nsn_data_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance
|
||||
*
|
||||
* @param model_name The name of the model instance
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the process function
|
||||
*
|
||||
* Every noise suppression model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the process function
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the noise suppression model and get data after process.
|
||||
*
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @param out_data An array of 16-bit signed audio samples after process.
|
||||
* @return The state of return.
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the process function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a noise suppression model
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a noise suppression model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_nsn_iface_op_create_t create;
|
||||
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_nsn_iface_op_process_t process;
|
||||
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_nsn_iface_op_destroy_t destroy;
|
||||
} esp_nsn_iface_t;
|
||||
17
include/esp32c5/esp_nsn_models.h
Normal file
17
include/esp32c5/esp_nsn_models.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "esp_nsn_iface.h"
|
||||
|
||||
/*
|
||||
The prefix of nsnet
|
||||
Now there are nsnet1 and nsnet2
|
||||
*/
|
||||
#define ESP_NSNET_PREFIX "nsnet"
|
||||
|
||||
/**
|
||||
* @brief Get the nsnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of nsnet
|
||||
*/
|
||||
esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
|
||||
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
|
||||
|
||||
float *esp_fftr(float *x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
|
||||
84
include/esp32c5/esp_sr_webrtc.h
Normal file
84
include/esp32c5/esp_sr_webrtc.h
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_WEBRTC_H_
|
||||
#define _ESP_WEBRTC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "esp_agc.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_ns.h"
|
||||
#include "sr_ringbuf.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include "esp_heap_caps.h"
|
||||
|
||||
typedef struct {
|
||||
void *ns_handle;
|
||||
void *agc_handle;
|
||||
int frame_size;
|
||||
int sample_rate;
|
||||
int16_t *buff;
|
||||
int16_t *out_data;
|
||||
sr_ringbuf_handle_t rb;
|
||||
} webrtc_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of webrtc.
|
||||
*
|
||||
* @warning frame_length_ms supports 10 ms, 20 ms, 30 ms, 32 ms.
|
||||
*
|
||||
* @param frame_length_ms The length of the audio processing
|
||||
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param agc_mode The mode of AGC
|
||||
* @param agc_gain The gain of AGC. default is 9
|
||||
* @param agc_target_level The target level of AGC. default is -3 dbfs
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of webrtc
|
||||
*/
|
||||
webrtc_handle_t *webrtc_create(
|
||||
int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
* @param size The sample size of output data
|
||||
* @param enable_ns Enable noise suppression
|
||||
* @param enable_agc Enable automatic gain control
|
||||
*
|
||||
* @return data after noise suppression
|
||||
*/
|
||||
int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
|
||||
|
||||
/**
|
||||
* @brief Free the webrtc instance
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void webrtc_destroy(webrtc_handle_t *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_WEBRTC_H_
|
||||
178
include/esp32c5/esp_vad.h
Normal file
178
include/esp32c5/esp_vad.h
Normal file
@ -0,0 +1,178 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_VAD_H_
|
||||
#define _ESP_VAD_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
|
||||
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
|
||||
* restrictive in reporting speech. So if you want to trigger more speech, please select a lower mode.
|
||||
*/
|
||||
typedef enum {
|
||||
VAD_MODE_0 = 0, // Normal
|
||||
VAD_MODE_1, // Aggressive
|
||||
VAD_MODE_2, // Very Aggressive
|
||||
VAD_MODE_3, // Very Very Aggressive
|
||||
VAD_MODE_4 // Very Very Very Aggressive
|
||||
} vad_mode_t;
|
||||
|
||||
typedef enum {
|
||||
VAD_SILENCE = 0,
|
||||
VAD_SPEECH = 1,
|
||||
} vad_state_t;
|
||||
|
||||
typedef struct vad_trigger_tag {
|
||||
vad_state_t state;
|
||||
unsigned int min_speech_len;
|
||||
unsigned int noise_len;
|
||||
unsigned int min_noise_len;
|
||||
unsigned int speech_len;
|
||||
} vad_trigger_t;
|
||||
|
||||
#define vad_MAX_LEN INT32_MAX - 1
|
||||
/**
|
||||
* @brief Allocate VAD trigger
|
||||
*
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
*
|
||||
* @return Trigger pointer
|
||||
**/
|
||||
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Free VAD trigger
|
||||
**/
|
||||
void vad_trigger_free(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Reset VAD trigger
|
||||
**/
|
||||
void vad_trigger_reset(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Detect voice activity by trigger
|
||||
**/
|
||||
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
int sample_rate;
|
||||
int frame_size;
|
||||
} vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t *vad_handle_t;
|
||||
|
||||
// typedef vad_handle_tag * vad_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param sample_rate Sample rate in Hz
|
||||
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @param min_speech_ms Minimum speech duration, unit is ms
|
||||
* @param min_noise_ms Minimum noise duration, unit is ms
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(
|
||||
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
|
||||
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
|
||||
|
||||
/**
|
||||
* @brief Reset trigger state as Silence
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
*/
|
||||
void vad_reset_trigger(vad_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Free the VAD instance
|
||||
*
|
||||
* @param inst The instance of VAD.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void vad_destroy(vad_handle_t inst);
|
||||
|
||||
/*
|
||||
* Programming Guide:
|
||||
*
|
||||
* @code{c}
|
||||
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to
|
||||
* the VAD structure.
|
||||
*
|
||||
* while (1) {
|
||||
* //Use buffer to receive the audio data from MIC.
|
||||
* vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Feed samples to the VAD process and get the result.
|
||||
* }
|
||||
*
|
||||
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
|
||||
*
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_VAD_H_
|
||||
164
include/esp32c5/esp_vadn_iface.h
Normal file
164
include/esp32c5/esp_vadn_iface.h
Normal file
@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of channels of the samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger voice activity detection, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether it is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The VAD state, VAD_SILENCE if no voice is detected, else
|
||||
* VAD_SPEECH.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether it is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The VAD state, VAD_SILENCE if no voice is detected, else
|
||||
* VAD_SPEECH.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32c5/esp_vadn_models.h
Normal file
22
include/esp32c5/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model name is used to filter all vadnet from available models.
|
||||
#define ESP_VADN_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the vadnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of vadnet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -29,6 +29,7 @@ typedef enum {
|
||||
DET_MODE_2CH_95 = 3,
|
||||
DET_MODE_3CH_90 = 4,
|
||||
DET_MODE_3CH_95 = 5,
|
||||
DET_MODE_90_COPY_PARAMS = 6, // Aggressive
|
||||
} det_mode_t;
|
||||
|
||||
typedef struct {
|
||||
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
|
||||
* @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
|
||||
* @param word_index The index of wake word
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the threshold to its initial state
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the wake word detection threshold of different modes
|
||||
*
|
||||
@ -200,6 +210,7 @@ typedef struct {
|
||||
esp_wn_iface_op_get_word_num_t get_word_num;
|
||||
esp_wn_iface_op_get_word_name_t get_word_name;
|
||||
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
|
||||
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
|
||||
@ -11,7 +11,7 @@ extern "C" {
|
||||
/**
|
||||
* @brief Get the wakenet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of wakenet
|
||||
*/
|
||||
const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
/**
|
||||
* @brief Get the wake word name from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
|
||||
*/
|
||||
char* esp_wn_wakeword_from_name(const char *model_name);
|
||||
char *esp_wn_wakeword_from_name(const char *model_name);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
20
include/esp32c5/flite_g2p.h
Normal file
20
include/esp32c5/flite_g2p.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef __FLITE_G2P_H__
|
||||
#define __FLITE_G2P_H__
|
||||
|
||||
typedef struct {
|
||||
int num_phonemes;
|
||||
int phoneme_size;
|
||||
char **phonemes;
|
||||
} flite_g2p_result;
|
||||
|
||||
void flite_g2p_result_free(flite_g2p_result *result);
|
||||
|
||||
flite_g2p_result *flite_g2p_get_result(const char *grapheme);
|
||||
|
||||
void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p(const char *graphemes, int map_phonemes);
|
||||
|
||||
#endif
|
||||
@ -2,9 +2,8 @@
|
||||
#ifndef _ESP_AFE_AEC_H_
|
||||
#define _ESP_AFE_AEC_H_
|
||||
|
||||
|
||||
#include "esp_afe_config.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_afe_config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -13,19 +12,19 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
aec_handle_t* handle;
|
||||
aec_handle_t *handle;
|
||||
aec_mode_t mode;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int frame_size;
|
||||
int16_t *data;
|
||||
}afe_aec_handle_t;
|
||||
|
||||
int16_t *data;
|
||||
} afe_aec_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only support 1 microphone channel and 1 playback channe.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only support 1 microphone channel and 1 playback channe.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback
|
||||
* channel will be selected.
|
||||
*
|
||||
* The input format, same as afe config:
|
||||
* M to represent the microphone channel
|
||||
@ -37,7 +36,8 @@ typedef struct {
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
|
||||
* esp32c5.
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
@ -45,17 +45,17 @@ typedef struct {
|
||||
*/
|
||||
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
|
||||
*
|
||||
*
|
||||
* @param inst The instance of AEC.
|
||||
* @param indata Input audio data, format is define by input_format. Note indata will be modified in function call.
|
||||
* @param outdata Returns near-end signal with echo removed.
|
||||
* @param indata Input audio data, format is define by input_format.
|
||||
* @param outdata Near-end signal with echo removed. outdata must be 16-byte aligned.
|
||||
* please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory
|
||||
|
||||
* @return The bytes of outdata.
|
||||
*/
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Get frame size of AEC (the samples of one frame)
|
||||
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
|
||||
*/
|
||||
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Free the AEC instance
|
||||
*
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
#pragma once
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -27,7 +33,8 @@ typedef enum {
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
|
||||
AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
@ -62,8 +69,220 @@ typedef enum {
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
|
||||
// If you find vad cache can not cover all speech, please increase this value.
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR, AFE_TYPE_VC or AFE_TYPE_VC_8K
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
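* @brief Concatenate input data into the output buffer (NOTE(review): the previous brief duplicated afe_adjust_gain; confirm exact semantics)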
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size The frame size of the input data
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
* @param out_data The output audio data
|
||||
* @param out_frame_size The frame size of the output data
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
48
include/esp32c6/esp_afe_doa.h
Normal file
48
include/esp32c6/esp_afe_doa.h
Normal file
@ -0,0 +1,48 @@
|
||||
#ifndef _ESP_AFE_DOA_H_
|
||||
#define _ESP_AFE_DOA_H_
|
||||
|
||||
#include "esp_doa.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
doa_handle_t *doa_handle;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int16_t *leftdata;
|
||||
int16_t *rightdata;
|
||||
int frame_size;
|
||||
} afe_doa_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param input_format The input format
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized afe_doa_handle_t object pointer. Using the example configuration above is recommended for better performance
|
||||
*/
|
||||
afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param handle afe_doa_handle_t instance pointer
|
||||
* @param indata Input audio data, format is defined by input_format.
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param handle afe_doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void afe_doa_destroy(afe_doa_handle_t *handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_AFE_DOA_H_ */
|
||||
237
include/esp32c6/esp_afe_sr_iface.h
Normal file
237
include/esp32c6/esp_afe_sr_iface.h
Normal file
@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
|
||||
// start from 1.
|
||||
vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
*
|
||||
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the AFE_SR
|
||||
*
|
||||
* @warning The input data should be arranged in the format of channel interleaving.
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal; only signed 16-bit samples at 16 kHz are supported. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @warning The output is single channel data, no matter how many channels the input is.
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same as the function `fetch`
|
||||
*
|
||||
* @warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Set wakenet detection threshold
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @param threshold The wakenet detection threshold, the value is between 0.4 and 0.9999.
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
|
||||
|
||||
/**
|
||||
* @brief Reset wakenet detection threshold to initial state
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
|
||||
|
||||
/**
|
||||
* @brief Reset one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Print all functions/modules/algorithms pipeline.
|
||||
* The pipeline is the order of the functions/modules/algorithms.
|
||||
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy a AFE_SR instance
|
||||
*
|
||||
* @param afe AFE_SR object to destroy
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_afe_sr_iface_op_create_from_config_t create_from_config;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_vad;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_vad;
|
||||
esp_afe_sr_iface_op_reset_op_t reset_vad;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_ns;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_ns;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_agc;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_agc;
|
||||
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
13
include/esp32c6/esp_afe_sr_models.h
Normal file
13
include/esp32c6/esp_afe_sr_models.h
Normal file
@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
47
include/esp32c6/esp_agc.h
Normal file
47
include/esp32c6/esp_agc.h
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_AGC_H_
|
||||
#define _ESP_AGC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
////all positive values are valid, negative values are errors
|
||||
typedef enum {
|
||||
ESP_AGC_SUCCESS = 0, ////success
|
||||
ESP_AGC_FAIL = -1, ////agc fail
|
||||
ESP_AGC_SAMPLE_RATE_ERROR = -2, ///sample rate can be only 8khz, 16khz, 32khz
|
||||
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
|
||||
} ESP_AGE_ERR;
|
||||
|
||||
typedef enum {
|
||||
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
|
||||
AGC_MODE_0 = 0, // Only saturation protection
|
||||
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
|
||||
} agc_mode_t;
|
||||
|
||||
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
|
||||
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
|
||||
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
|
||||
void esp_agc_close(void *agc_handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _ESP_AGC_H_
|
||||
41
include/esp32c6/esp_doa.h
Normal file
41
include/esp32c6/esp_doa.h
Normal file
@ -0,0 +1,41 @@
|
||||
#ifndef _ESP_DOA_H_
|
||||
#define _ESP_DOA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct doa_handle_t doa_handle_t;
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
|
||||
*/
|
||||
doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param doa doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void esp_doa_destroy(doa_handle_t *doa);
|
||||
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param doa doa_handle_t instance pointer
|
||||
* @param left Left channel 16-bit PCM data
|
||||
* @param right Right channel 16-bit PCM data
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_DOA_H_ */
|
||||
93
include/esp32c6/esp_mase.h
Normal file
93
include/esp32c6/esp_mase.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_MASE_H_
|
||||
#define _ESP_MASE_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MASE_SAMPLE_RATE 16000 // Supports 16kHz only
|
||||
#define MASE_FRAME_SIZE 16 // Supports 16ms only
|
||||
#define MASE_MIC_DISTANCE 65 // According to physical design of mic-array
|
||||
|
||||
/**
|
||||
* @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array
|
||||
* are supported.
|
||||
*/
|
||||
typedef enum {
|
||||
TWO_MIC_LINE = 0,
|
||||
THREE_MIC_CIRCLE = 1
|
||||
} mase_mic_array_type_t;
|
||||
|
||||
/**
|
||||
* @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
|
||||
*/
|
||||
typedef enum {
|
||||
NORMAL_ENHANCEMENT_MODE = 0,
|
||||
WAKE_UP_ENHANCEMENT_MODE = 1
|
||||
} mase_op_mode_t;
|
||||
|
||||
typedef void* mase_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the MASE structure.
|
||||
*
|
||||
* @param fs The sampling frequency (Hz) must be 16000.
|
||||
*
|
||||
* @param frame_size The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param array_type '0' for 2-mic line array and '1' for 3-mic circular array.
|
||||
*
|
||||
* @param mic_distance The distance between neighboring microphones in mm.
|
||||
*
|
||||
* @param operating_mode '0' for normal mode and '1' for wake-up enhanced mode.
|
||||
*
|
||||
* @param filter_strength Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: An instance of MASE
|
||||
*/
|
||||
mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
|
||||
|
||||
/**
|
||||
* @brief Performs mic array processing for one frame.
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @param in An array of 16-bit signed audio samples from mic.
|
||||
*
|
||||
* @param dsp_out Returns enhanced signal.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
|
||||
|
||||
/**
|
||||
* @brief Free the MASE instance
|
||||
*
|
||||
* @param st The instance of MASE.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_destory(mase_handle_t st);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
223
include/esp32c6/esp_mn_iface.h
Normal file
223
include/esp32c6/esp_mn_iface.h
Normal file
@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ESP_MN_RESULT_MAX_NUM 5
|
||||
#define ESP_MN_MAX_PHRASE_NUM 400
|
||||
#define ESP_MN_MAX_PHRASE_LEN 63
|
||||
#define ESP_MN_MIN_PHRASE_LEN 2
|
||||
|
||||
#define ESP_MN_PREFIX "mn"
|
||||
#define ESP_MN_ENGLISH "en"
|
||||
#define ESP_MN_CHINESE "cn"
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_STATE_DETECTING = 0, // detecting
|
||||
ESP_MN_STATE_DETECTED = 1, // detected
|
||||
ESP_MN_STATE_TIMEOUT = 2, // time out
|
||||
} esp_mn_state_t;
|
||||
|
||||
//Set multinet loading mode
|
||||
//The memory consumption is decreased with increasing mode,
|
||||
//As a consequence also the CPU loading rate goes up
|
||||
typedef enum {
|
||||
ESP_MN_LOAD_FROM_PSRAM = 0, // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
|
||||
ESP_MN_LOAD_FROM_PSRAM_FLASH = 1, // Load some weights from PSRAM and laod the rest from FLASH (default)
|
||||
ESP_MN_LOAD_FROM_FLASH = 2, // Load more weights from FLASH. Minimum memory consumption with slowest computation
|
||||
} esp_mn_loader_mode_t;
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_GREEDY_SEARCH = 0, // greedy search
|
||||
ESP_MN_BEAM_SEARCH = 1, // beam search
|
||||
ESP_MN_BEAM_SEARCH_WITH_FST = 2, // beam search with trie language model
|
||||
} esp_mn_search_method_t;
|
||||
|
||||
typedef enum {
|
||||
CHINESE_ID = 1, // Chinese language
|
||||
ENGLISH_ID = 2, // English language
|
||||
} language_id_t;
|
||||
|
||||
// Return all possible recognition results
|
||||
typedef struct{
|
||||
esp_mn_state_t state;
|
||||
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
|
||||
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id.
|
||||
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
|
||||
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
|
||||
char string[256];
|
||||
} esp_mn_results_t;
|
||||
|
||||
typedef struct {
|
||||
char *string; // command string
|
||||
char *phonemes; // command phonemes, if applicable
|
||||
int16_t command_id; // the command id
|
||||
float threshold; // trigger threshold, default: 0
|
||||
int16_t *wave; // prompt wave data of the phrase
|
||||
} esp_mn_phrase_t;
|
||||
|
||||
typedef struct _mn_node_ {
|
||||
esp_mn_phrase_t *phrase;
|
||||
struct _mn_node_ *next;
|
||||
} esp_mn_node_t;
|
||||
|
||||
typedef struct{
|
||||
int16_t num; // The number of error phrases, which can not added into model
|
||||
esp_mn_phrase_t **phrases; // The array of error phrase pointer
|
||||
} esp_mn_error_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize a model instance with specified model name.
|
||||
*
|
||||
* @param model_name The wakenet model name.
|
||||
* @param duration The duration (ms) to trigger the timeout
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
|
||||
|
||||
/**
|
||||
* @brief Switch multinet mode to change memory consumption and CPU loading
|
||||
*
|
||||
* @warning Just Support multinet6 or later versions
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param mode The multinet loader mode
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the number of frames recognized by the command word
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of the frames recognized by the command word
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the language of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return Language name string defined in esp_mn_models.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
|
||||
*/
|
||||
typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
||||
*
|
||||
* @param model The model object to query.
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @return The state of multinet
|
||||
*/
|
||||
typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Destroy a speech commands recognition model
|
||||
*
|
||||
* @param model The Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get recognition results
|
||||
*
|
||||
* @param model The Model object to query
|
||||
*
|
||||
* @return The current results.
|
||||
*/
|
||||
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Open the log print
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Clean all status of model
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Set the speech commands by mn_command_root
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
* @param mn_command_root The speech commands link.
|
||||
* @return The error phrase id info.
|
||||
*/
|
||||
typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Check if input string can be tokenized
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
* @param str The input string
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
|
||||
|
||||
typedef struct {
|
||||
esp_mn_iface_op_create_t create;
|
||||
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
||||
esp_mn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_mn_iface_op_get_language_t get_language;
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
esp_mn_iface_op_get_results_t get_results;
|
||||
esp_mn_iface_op_open_log_t open_log;
|
||||
esp_mn_iface_op_clean_t clean;
|
||||
esp_wn_iface_op_set_speech_commands set_speech_commands;
|
||||
esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
|
||||
esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
|
||||
esp_mn_iface_op_check_speech_command check_speech_command;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
66
include/esp32c6/esp_mn_models.h
Normal file
66
include/esp32c6/esp_mn_models.h
Normal file
@ -0,0 +1,66 @@
|
||||
#pragma once
|
||||
#include "esp_mn_iface.h"
|
||||
|
||||
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
|
||||
//a specific phrase or word.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/**
|
||||
* @brief Get the multinet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of multinet
|
||||
*/
|
||||
esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the multinet language from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The language of multinet
|
||||
*/
|
||||
char *esp_mn_language_from_name(char *model_name);
|
||||
|
||||
/*
|
||||
Configure wake word to use based on what's selected in menuconfig.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
|
||||
#include "multinet2_ch.h"
|
||||
#define MULTINET_COEFF get_coeff_multinet2_ch
|
||||
#define MULTINET_MODEL_NAME "mn2_cn"
|
||||
|
||||
#else
|
||||
#define MULTINET_COEFF "COEFF_NULL"
|
||||
#define MULTINET_MODEL_NAME "NULL"
|
||||
#endif
|
||||
|
||||
|
||||
/* example
|
||||
|
||||
static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
|
||||
|
||||
//Initialize MultiNet model data
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
|
||||
add_speech_commands(multinet, model_data);
|
||||
|
||||
//Set parameters of buffer
|
||||
int audio_chunksize=multinet->get_samp_chunksize(model_data);
|
||||
int frequency = multinet->get_samp_rate(model_data);
|
||||
int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
|
||||
|
||||
//Detect
|
||||
int r=multinet->detect(model_data, buffer);
|
||||
if (r>0) {
|
||||
printf("Detection triggered output %d.\n", r);
|
||||
}
|
||||
|
||||
//Destroy model
|
||||
multinet->destroy(model_data);
|
||||
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
86
include/esp32c6/esp_ns.h
Normal file
86
include/esp32c6/esp_ns.h
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_NS_H_
|
||||
#define _ESP_NS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NS_USE_SPIARM 0
|
||||
#define NS_FRAME_LENGTH_MS 10 //Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* The Sampling frequency (Hz) must be 16000Hz
|
||||
*/
|
||||
|
||||
typedef void* ns_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the NS structure.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can be 10ms, 20ms, 30ms.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_create(int frame_length);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of the more powerful noise suppression algorithm.
|
||||
*
|
||||
* @warning frame_length only supports 10 ms.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can only be 10ms.
|
||||
* @param mode 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
*
|
||||
* @param outdata An array of 16-bit signed audio samples after noise suppression.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Free the NS instance
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_destroy(ns_handle_t inst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_NS_H_
|
||||
64
include/esp32c6/esp_nsn_iface.h
Normal file
64
include/esp32c6/esp_nsn_iface.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
|
||||
//Opaque model data container
|
||||
typedef struct esp_nsn_data_t esp_nsn_data_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance
|
||||
*
|
||||
* @param model_name The name of the model instance
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the process function
|
||||
*
|
||||
* Every noise suppression model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the process function
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the noise suppression model and get data after process.
|
||||
*
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @param out_data An array of 16-bit signed audio samples after process.
|
||||
* @return The state of return.
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the process function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a noise suppression model
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a noise suppression model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_nsn_iface_op_create_t create;
|
||||
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_nsn_iface_op_process_t process;
|
||||
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_nsn_iface_op_destroy_t destroy;
|
||||
} esp_nsn_iface_t;
|
||||
17
include/esp32c6/esp_nsn_models.h
Normal file
17
include/esp32c6/esp_nsn_models.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "esp_nsn_iface.h"
|
||||
|
||||
/*
|
||||
The prefix of nsnet
|
||||
Now there are nsnet1 and nsnet2
|
||||
*/
|
||||
#define ESP_NSNET_PREFIX "nsnet"
|
||||
|
||||
/**
|
||||
* @brief Get the nsnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of nsnet
|
||||
*/
|
||||
esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
|
||||
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
|
||||
|
||||
float *esp_fftr(float *x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
|
||||
84
include/esp32c6/esp_sr_webrtc.h
Normal file
84
include/esp32c6/esp_sr_webrtc.h
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_WEBRTC_H_
|
||||
#define _ESP_WEBRTC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "esp_agc.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_ns.h"
|
||||
#include "sr_ringbuf.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include "esp_heap_caps.h"
|
||||
|
||||
typedef struct {
|
||||
void *ns_handle;
|
||||
void *agc_handle;
|
||||
int frame_size;
|
||||
int sample_rate;
|
||||
int16_t *buff;
|
||||
int16_t *out_data;
|
||||
sr_ringbuf_handle_t rb;
|
||||
} webrtc_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of webrtc.
|
||||
*
|
||||
* @warning frame_length_ms supports 10 ms, 20 ms, 30 ms, 32 ms.
|
||||
*
|
||||
* @param frame_length_ms The length of the audio processing
|
||||
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param agc_mode The mode of AGC
|
||||
* @param agc_gain The gain of AGC. default is 9
|
||||
* @param agc_target_level The target level of AGC. default is -3 dbfs
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of webrtc
|
||||
*/
|
||||
webrtc_handle_t *webrtc_create(
|
||||
int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
* @param size The sample size of output data
|
||||
* @param enable_ns Enable noise suppression
|
||||
* @param enable_agc Enable automatic gain control
|
||||
*
|
||||
* @return data after noise suppression
|
||||
*/
|
||||
int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
|
||||
|
||||
/**
|
||||
* @brief Free the webrtc instance
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void webrtc_destroy(webrtc_handle_t *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_WEBRTC_H_
|
||||
178
include/esp32c6/esp_vad.h
Normal file
178
include/esp32c6/esp_vad.h
Normal file
@ -0,0 +1,178 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_VAD_H_
|
||||
#define _ESP_VAD_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
|
||||
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
|
||||
* restrictive in reporting speech. So If you want trigger more speech, please select lower mode.
|
||||
*/
|
||||
typedef enum {
|
||||
VAD_MODE_0 = 0, // Normal
|
||||
VAD_MODE_1, // Aggressive
|
||||
VAD_MODE_2, // Very Aggressive
|
||||
VAD_MODE_3, // Very Very Aggressive
|
||||
VAD_MODE_4 // Very Very Very Aggressive
|
||||
} vad_mode_t;
|
||||
|
||||
typedef enum {
|
||||
VAD_SILENCE = 0,
|
||||
VAD_SPEECH = 1,
|
||||
} vad_state_t;
|
||||
|
||||
typedef struct vad_trigger_tag {
|
||||
vad_state_t state;
|
||||
unsigned int min_speech_len;
|
||||
unsigned int noise_len;
|
||||
unsigned int min_noise_len;
|
||||
unsigned int speech_len;
|
||||
} vad_trigger_t;
|
||||
|
||||
// Parenthesized so the macro expands as a single value inside larger expressions
// (e.g. `vad_MAX_LEN / 2` or `x - vad_MAX_LEN`).
#define vad_MAX_LEN (INT32_MAX - 1)
|
||||
/**
|
||||
* @brief Allocate VAD trigger
|
||||
*
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
*
|
||||
* @return Trigger pointer
|
||||
**/
|
||||
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Free VAD trigger
|
||||
**/
|
||||
void vad_trigger_free(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Reset VAD trigger
|
||||
**/
|
||||
void vad_trigger_reset(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Detect voice activity using the trigger
|
||||
**/
|
||||
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
int sample_rate;
|
||||
int frame_size;
|
||||
} vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t *vad_handle_t;
|
||||
|
||||
// typedef vad_handle_tag * vad_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param sample_rate Sample rate in Hz
|
||||
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @param min_speech_ms Minimum speech duration, unit is ms
|
||||
* @param min_noise_ms Minimum noise duration, unit is ms
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(
|
||||
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
|
||||
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
|
||||
|
||||
/**
|
||||
* @brief Reset trigger state as Silence
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
*/
|
||||
void vad_reset_trigger(vad_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Free the VAD instance
|
||||
*
|
||||
* @param inst The instance of VAD.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void vad_destroy(vad_handle_t inst);
|
||||
|
||||
/*
|
||||
* Programming Guide:
|
||||
*
|
||||
* @code{c}
|
||||
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to
|
||||
* the VAD structure.
|
||||
*
|
||||
* while (1) {
|
||||
* //Use buffer to receive the audio data from MIC.
|
||||
* vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Feed samples to the VAD process and get the result.
|
||||
* }
|
||||
*
|
||||
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
|
||||
*
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_VAD_H_
|
||||
164
include/esp32c6/esp_vadn_iface.h
Normal file
164
include/esp32c6/esp_vadn_iface.h
Normal file
@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of input audio channels
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger wake words, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The detected voice activity state: VAD_SILENCE if no voice is detected,
|
||||
* VAD_SPEECH otherwise.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The detected voice activity state: VAD_SILENCE if no voice is detected,
|
||||
* VAD_SPEECH otherwise.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32c6/esp_vadn_models.h
Normal file
22
include/esp32c6/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model name is used to filter all vadnet from available models.
|
||||
#define ESP_VADN_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the vadnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of vadnet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -29,6 +29,7 @@ typedef enum {
|
||||
DET_MODE_2CH_95 = 3,
|
||||
DET_MODE_3CH_90 = 4,
|
||||
DET_MODE_3CH_95 = 5,
|
||||
DET_MODE_90_COPY_PARAMS = 6, // Aggressive
|
||||
} det_mode_t;
|
||||
|
||||
typedef struct {
|
||||
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
|
||||
* @brief Set the detection threshold to manually abjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
|
||||
* @param det_treshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
|
||||
* @param word_index The index of wake word
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the threshold to its initial state
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the wake word detection threshold of different modes
|
||||
*
|
||||
@ -200,6 +210,7 @@ typedef struct {
|
||||
esp_wn_iface_op_get_word_num_t get_word_num;
|
||||
esp_wn_iface_op_get_word_name_t get_word_name;
|
||||
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
|
||||
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
|
||||
@ -11,7 +11,7 @@ extern "C" {
|
||||
/**
|
||||
* @brief Get the wakenet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of wakenet
|
||||
*/
|
||||
const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
/**
|
||||
* @brief Get the wake word name from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
|
||||
*/
|
||||
char* esp_wn_wakeword_from_name(const char *model_name);
|
||||
char *esp_wn_wakeword_from_name(const char *model_name);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
20
include/esp32c6/flite_g2p.h
Normal file
20
include/esp32c6/flite_g2p.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef __FLITE_G2P_H__
|
||||
#define __FLITE_G2P_H__
|
||||
|
||||
typedef struct {
|
||||
int num_phonemes;
|
||||
int phoneme_size;
|
||||
char **phonemes;
|
||||
} flite_g2p_result;
|
||||
|
||||
void flite_g2p_result_free(flite_g2p_result *result);
|
||||
|
||||
flite_g2p_result *flite_g2p_get_result(const char *grapheme);
|
||||
|
||||
void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p(const char *graphemes, int map_phonemes);
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
@ -2,9 +2,8 @@
|
||||
#ifndef _ESP_AFE_AEC_H_
|
||||
#define _ESP_AFE_AEC_H_
|
||||
|
||||
|
||||
#include "esp_afe_config.h"
|
||||
#include "esp_aec.h"
|
||||
#include "esp_afe_config.h"
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
@ -13,19 +12,19 @@ extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
aec_handle_t* handle;
|
||||
aec_handle_t *handle;
|
||||
aec_mode_t mode;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int frame_size;
|
||||
int16_t *data;
|
||||
}afe_aec_handle_t;
|
||||
|
||||
int16_t *data;
|
||||
} afe_aec_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only support 1 microphone channel and 1 playback channe.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback channel will be selected.
|
||||
* @brief Creates an instance to the AEC structure.
|
||||
*
|
||||
* @warning Currently only support 1 microphone channel and 1 playback channel.
|
||||
* If input has multiple microphone channels and playback channels, just the first microphone channel and playback
|
||||
* channel will be selected.
|
||||
*
|
||||
* The input format, same as afe config:
|
||||
* M to represent the microphone channel
|
||||
@ -37,7 +36,8 @@ typedef struct {
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param filter_length The length of filter. The larger the filter, the higher the CPU loading.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for esp32c5.
|
||||
* Recommended filter_length = 4 for esp32s3 and esp32p4. Recommended filter_length = 2 for
|
||||
* esp32c5.
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
@ -45,17 +45,17 @@ typedef struct {
|
||||
*/
|
||||
afe_aec_handle_t *afe_aec_create(const char *input_format, int filter_length, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Performs echo cancellation a frame, based on the audio sent to the speaker and frame from mic.
|
||||
*
|
||||
*
|
||||
* @param inst The instance of AEC.
|
||||
* @param indata Input audio data, format is define by input_format. Note indata will be modified in function call.
|
||||
* @param outdata Returns near-end signal with echo removed.
|
||||
* @param indata Input audio data, format is define by input_format.
|
||||
* @param outdata Near-end signal with echo removed. outdata must be 16-byte aligned.
|
||||
* please use heap_caps_aligned_calloc(16, n, size, caps) to allocate an aligned chunk of memory
|
||||
|
||||
* @return The bytes of outdata.
|
||||
*/
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outdata);
|
||||
size_t afe_aec_process(afe_aec_handle_t *handel, const int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Get frame size of AEC (the samples of one frame)
|
||||
@ -64,7 +64,6 @@ size_t afe_aec_process(afe_aec_handle_t *handel, int16_t *indata, int16_t *outda
|
||||
*/
|
||||
int afe_aec_get_chunksize(afe_aec_handle_t *handle);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Free the AEC instance
|
||||
*
|
||||
|
||||
@ -1,9 +1,15 @@
|
||||
#pragma once
|
||||
#include "esp_aec.h"
|
||||
#include "esp_agc.h"
|
||||
#include "esp_nsn_models.h"
|
||||
#include "esp_vad.h"
|
||||
#include "esp_vadn_models.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#include "esp_wn_models.h"
|
||||
#include "model_path.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
@ -27,7 +33,8 @@ typedef enum {
|
||||
// Set AFE type
|
||||
typedef enum {
|
||||
AFE_TYPE_SR = 0, // Speech recognition scenarios, excluding nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, including nonlinear noise suppression
|
||||
AFE_TYPE_VC = 1, // Voice communication scenarios, 16KHz input, including nonlinear noise suppression
|
||||
AFE_TYPE_VC_8K = 2, // Voice communication scenarios, 8KHz input, note that the input data must be 8KHz
|
||||
} afe_type_t;
|
||||
|
||||
typedef enum {
|
||||
@ -62,8 +69,220 @@ typedef enum {
|
||||
AFE_AGC_MODE_WAKENET = 1, // AGC gain is calculated by wakenet model if wakenet is activated
|
||||
} afe_agc_mode_t;
|
||||
|
||||
/**
|
||||
* @brief Function to get the debug audio data
|
||||
*
|
||||
* @param data The debug audio data, which must not be modified. It should be copied away as soon as possible to
|
||||
* avoid blocking for too long.
|
||||
* @param data_size The number of bytes of data.
|
||||
* @returns
|
||||
*/
|
||||
typedef void (*afe_debug_hook_callback_t)(const int16_t *data, int data_size);
|
||||
|
||||
typedef enum {
|
||||
AFE_DEBUG_HOOK_MASE_TASK_IN = 0, // To get the input data of mase task
|
||||
AFE_DEBUG_HOOK_FETCH_TASK_IN = 1, // To get the input data of fetch task
|
||||
AFE_DEBUG_HOOK_MAX = 2
|
||||
} afe_debug_hook_type_t;
|
||||
|
||||
typedef struct {
|
||||
afe_debug_hook_type_t hook_type; // debug type of hook
|
||||
afe_debug_hook_callback_t hook_callback; // callback function which transfer debug audio data
|
||||
} afe_debug_hook_t;
|
||||
|
||||
typedef struct {
|
||||
/********** AEC(Acoustic Echo Cancellation) **********/
|
||||
bool aec_init; // Whether to init aec
|
||||
aec_mode_t aec_mode; // The mode of aec, AEC_MODE_SR_LOW_COST or AEC_MODE_SR_HIGH_PERF
|
||||
int aec_filter_length; // The filter length of aec
|
||||
|
||||
/********** SE(Speech Enhancement, microphone array processing) **********/
|
||||
bool se_init; // Whether to init se
|
||||
|
||||
/********** NS(Noise Suppression) **********/
|
||||
bool ns_init; // Whether to init ns
|
||||
char *ns_model_name; // Model name of ns
|
||||
afe_ns_mode_t afe_ns_mode; // Model mode of ns
|
||||
|
||||
/********** VAD(Voice Activity Detection) **********/
|
||||
bool vad_init; // Whether to init vad
|
||||
vad_mode_t vad_mode; // The value can be: VAD_MODE_0, VAD_MODE_1, VAD_MODE_2, VAD_MODE_3, VAD_MODE_4
|
||||
char *vad_model_name; // The model name of vad, If it is null, WebRTC VAD will be used.
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
|
||||
// 1000 ms
|
||||
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
|
||||
// If you find vad cache can not cover all speech, please increase this value.
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
|
||||
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false
|
||||
|
||||
/********** WakeNet(Wake Word Engine) **********/
|
||||
bool wakenet_init;
|
||||
char *wakenet_model_name; // The model name of wakenet 1
|
||||
char *wakenet_model_name_2; // The model name of wakenet 2 if has wakenet 2
|
||||
det_mode_t wakenet_mode; // The mode of wakenet
|
||||
|
||||
/********** AGC(Automatic Gain Control) **********/
|
||||
bool agc_init; // Whether to init agc
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
afe_mode_t afe_mode; // The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
afe_type_t afe_type; // The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
int afe_perferred_core; // The preferred core of afe se task, which is created in afe_create function.
|
||||
int afe_perferred_priority; // The preferred priority of afe se task, which is created in afe_create function.
|
||||
int afe_ringbuf_size; // The ring buffer size: the number of frame data in ring buffer.
|
||||
afe_memory_alloc_mode_t memory_alloc_mode; // The memory alloc mode for afe. From Internal RAM or PSRAM
|
||||
float afe_linear_gain; // The linear gain for afe output the value should be in [0.1, 10.0]. This value acts
|
||||
// directly on the output amplitude: out_linear_gain * amplitude.
|
||||
bool debug_init;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
} afe_config_t;
|
||||
|
||||
/**
|
||||
* @brief Get AFE default configuration. The default configuration will enable all algorithms as much as possible based
|
||||
* on the chip target and input format. You can manually fine-tune it after creating the configuration
|
||||
*
|
||||
* The input format:
|
||||
* M to represent the microphone channel
|
||||
* R to represent the playback reference channel
|
||||
* N to represent an unknown or unused channel
|
||||
*
|
||||
* For example, input_format="MMNR" indicates that the input data consists of four channels,
|
||||
* which are the microphone channel, the microphone channel, an unused channel, and the playback channel
|
||||
*
|
||||
* @param input_format The input format
|
||||
* @param models Models from partition, which is configured by Kconfig
|
||||
* @param type The type of afe, AFE_TYPE_SR or AFE_TYPE_VC
|
||||
* @param mode The mode of afe, AFE_MODE_LOW_COST or AFE_MODE_HIGH_PERF
|
||||
*
|
||||
* @return afe_config_t* The default config of afe
|
||||
*/
|
||||
afe_config_t *afe_config_init(const char *input_format, srmodel_list_t *models, afe_type_t type, afe_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Check AFE configuration and make sure it is correct.
|
||||
*
|
||||
* @warning If there is a configuration conflict, this function will modify some parameters.
|
||||
* The guiding principle behind these modifications is to maintain the highest performance of the output audio and results.
|
||||
* And remove the conflict between different algorithms.
|
||||
*
|
||||
* For example, If input is two-channel data, the SE(BSS) algorithm will be prioritized over the NS algorithm.
|
||||
* If SE(BSS) algorithm is deactivated, will only use the first microphone channel.
|
||||
*
|
||||
* @param afe_config Input AFE config
|
||||
*
|
||||
* @return afe_config_t* The modified AFE config
|
||||
*/
|
||||
afe_config_t *afe_config_check(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input format
|
||||
*
|
||||
* @param input_format The input format, same with afe_config_init() function
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
* @return true if the input format is parsed successfully, otherwise false
|
||||
*/
|
||||
bool afe_parse_input_format(const char *input_format, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse I2S input data
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param mic_data The output microphone data
|
||||
* @param ref_data The output playback reference data
|
||||
* @param pcm_config The pcm config
|
||||
*
|
||||
*/
|
||||
void afe_parse_input(int16_t *data, int frame_size, int16_t *mic_data, int16_t *ref_data, afe_pcm_config_t *pcm_config);
|
||||
|
||||
/**
|
||||
* @brief Parse input data, from interleaved arrangement to contiguous arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_parse_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Format input data, from contiguous arrangement to interleaved arrangement
|
||||
*
|
||||
* @param data The input multi channel data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param channel_num The channel number of data
|
||||
* @param out_data The output data
|
||||
*
|
||||
*/
|
||||
void afe_format_data(int16_t *data, int frame_size, int channel_num, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param data The input audio data
|
||||
* @param frame_size The frame size of input, it is also the size of single channel data
|
||||
* @param factor The gain factor
|
||||
*
|
||||
* @return int16_t* The output audio data
|
||||
*/
|
||||
int16_t *afe_adjust_gain(int16_t *data, int frame_size, float factor);
|
||||
|
||||
/**
|
||||
* @brief Adjust the gain of input data
|
||||
*
|
||||
* @warning the input data will be modified inplace.
|
||||
*
|
||||
* @param in_data The input audio data
|
||||
* @param in_frame_size Input data frame size of input
|
||||
* @param channel_num The channel number of input data, which is same as output data
|
||||
* @param out_data The output audio data
|
||||
* @param out_frame_size Output data frame size
|
||||
*
|
||||
*/
|
||||
void afe_concat_data(int16_t *in_data, int in_frame_size, int channel_num, int16_t *out_data, int out_frame_size);
|
||||
|
||||
/**
|
||||
* @brief Copy the afe config
|
||||
*
|
||||
* @param dst_config The destination afe config
|
||||
* @param src_config The source afe config
|
||||
*
|
||||
* @return The destination afe config
|
||||
*/
|
||||
afe_config_t *afe_config_copy(afe_config_t *dst_config, const afe_config_t *src_config);
|
||||
|
||||
/**
|
||||
* @brief Print the afe config
|
||||
*
|
||||
* @param afe_config The afe config
|
||||
*/
|
||||
void afe_config_print(const afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Allocate afe config
|
||||
*
|
||||
* @return The afe config pointer
|
||||
*/
|
||||
afe_config_t *afe_config_alloc();
|
||||
|
||||
/**
|
||||
* @brief Free afe config
|
||||
*
|
||||
* @param afe_config The afe config pointer
|
||||
*/
|
||||
void afe_config_free(afe_config_t *afe_config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
48
include/esp32s2/esp_afe_doa.h
Normal file
48
include/esp32s2/esp_afe_doa.h
Normal file
@ -0,0 +1,48 @@
|
||||
#ifndef _ESP_AFE_DOA_H_
|
||||
#define _ESP_AFE_DOA_H_
|
||||
|
||||
#include "esp_doa.h"
|
||||
#include "esp_afe_config.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct {
|
||||
doa_handle_t *doa_handle;
|
||||
afe_pcm_config_t pcm_config;
|
||||
int16_t *leftdata;
|
||||
int16_t *rightdata;
|
||||
int frame_size;
|
||||
} afe_doa_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param input_format The input format
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
|
||||
*/
|
||||
afe_doa_handle_t *afe_doa_create(const char *input_format, int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param handle afe_doa_handle_t instance pointer
|
||||
* @param indata Input audio data, format is define by input_format.
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float afe_doa_process(afe_doa_handle_t *handle, const int16_t *indata);
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param handle afe_doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void afe_doa_destroy(afe_doa_handle_t *handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_AFE_DOA_H_ */
|
||||
237
include/esp32s2/esp_afe_sr_iface.h
Normal file
237
include/esp32s2/esp_afe_sr_iface.h
Normal file
@ -0,0 +1,237 @@
|
||||
#pragma once
|
||||
#include "esp_afe_config.h"
|
||||
#include "stdbool.h"
|
||||
#include "stdint.h"
|
||||
#include "stdlib.h"
|
||||
#include "freertos/FreeRTOS.h"
|
||||
#include "freertos/task.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// AFE: Audio Front-End
|
||||
// SR: Speech Recognition
|
||||
// afe_sr/AFE_SR: the audio front-end for speech recognition
|
||||
|
||||
// Opaque AFE_SR data container
|
||||
typedef struct esp_afe_sr_data_t esp_afe_sr_data_t;
|
||||
|
||||
/**
|
||||
* @brief The state of vad
|
||||
*/
|
||||
typedef enum {
|
||||
AFE_VAD_SILENCE = 0, // Deprecated, please use vad_state_t, noise or silence
|
||||
AFE_VAD_SPEECH = 1 // Deprecated, please use vad_state_t, speech
|
||||
} afe_vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief The result of fetch function
|
||||
*/
|
||||
typedef struct afe_fetch_result_t {
|
||||
int16_t *data; // the target channel data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the
|
||||
// audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc.
|
||||
// (note: invalid in vc). if enable wakenet, the window length is the receptive fields of
|
||||
// wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index
|
||||
// start from 1.
|
||||
vad_state_t vad_state; // the value is vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
int16_t *raw_data; // the multi-channel output data of audio.
|
||||
int raw_data_channels; // the channel number of raw data
|
||||
float ringbuff_free_pct; // the percent of ringbuff free size. if the value is larger than 0.5, it means the ringbuff is buzy.
|
||||
void *reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
/**
|
||||
* @brief Function to initialize an AFE_SR instance
|
||||
*
|
||||
* @param afe_config The config of AFE_SR
|
||||
* @returns Handle to the AFE_SR data
|
||||
*/
|
||||
typedef esp_afe_sr_data_t *(*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of each channel samples per frame that need to be passed to the function
|
||||
*
|
||||
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of samples to feed the fetch function
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The amount of total channels
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_channel_num_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the function
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_get_samp_rate_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the AFE_SR
|
||||
*
|
||||
* @Warning The input data should be arranged in the format of channel interleaving.
|
||||
* The last channel is reference signal if it has reference data.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*
|
||||
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
|
||||
* `get_feed_chunksize`.
|
||||
* @return The size of input
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t *in);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
* Timeout is 2000 ms. If you want to adjust timeout, please refer to the definition of `fetch_with_delay`.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief fetch enhanced samples of an audio stream from the AFE_SR, same with the function `fetch`
|
||||
*
|
||||
* @Warning The output is single channel data, no matter how many channels the input is.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param ticks_to_wait The timeout value, in ticks, to wait for the fetch result.
|
||||
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output
|
||||
* audio can be queried by the `get_fetch_chunksize`.)
|
||||
*/
|
||||
typedef afe_fetch_result_t *(*esp_afe_sr_iface_op_fetch_with_delay_t)(esp_afe_sr_data_t *afe, TickType_t ticks_to_wait);
|
||||
|
||||
/**
|
||||
* @brief reset ringbuf of AFE.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Set wakenet detection threshold
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @param threshold The wakenet detection threshold, the value is between 0.4 and 0.9999.
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_set_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index, float threshold);
|
||||
|
||||
/**
|
||||
* @brief Reset wakenet detection threshold to initial state
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @param index The wakenet index, just support 1: wakenet1 or 2: wakenet2
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_wakenet_threshold_t)(esp_afe_sr_data_t *afe, int index);
|
||||
|
||||
/**
|
||||
* @brief Reset one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 1: success
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Disable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_disable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Enable one function/module/algorithm.
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
* @return -1: fail, 0: disabled, 1: enabled
|
||||
*/
|
||||
typedef int (*esp_afe_sr_iface_op_enable_func_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Print all functions/modules/algorithms pipeline.
|
||||
* The pipeline is the order of the functions/modules/algorithms.
|
||||
* The format like this: [input] -> |AEC(VOIP_HIGH_PERF)| -> |WakeNet(wn9_hilexin)| -> [output]
|
||||
*
|
||||
* @param afe The AFE_SR object to query
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_print_pipeline_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* @brief Destroy a AFE_SR instance
|
||||
*
|
||||
* @param afe AFE_SR object to destroy
|
||||
*/
|
||||
typedef void (*esp_afe_sr_iface_op_destroy_t)(esp_afe_sr_data_t *afe);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a AFE_SR.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_afe_sr_iface_op_create_from_config_t create_from_config;
|
||||
esp_afe_sr_iface_op_feed_t feed;
|
||||
esp_afe_sr_iface_op_fetch_t fetch;
|
||||
esp_afe_sr_iface_op_fetch_with_delay_t fetch_with_delay;
|
||||
esp_afe_sr_iface_op_reset_buffer_t reset_buffer;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_feed_chunksize;
|
||||
esp_afe_sr_iface_op_get_samp_chunksize_t get_fetch_chunksize;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_channel_num; // same with get_feed_channel_num
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_feed_channel_num;
|
||||
esp_afe_sr_iface_op_get_channel_num_t get_fetch_channel_num;
|
||||
esp_afe_sr_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_afe_sr_iface_op_set_wakenet_threshold_t set_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_reset_wakenet_threshold_t reset_wakenet_threshold;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_wakenet;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_wakenet;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_aec;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_aec;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_se;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_se;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_vad;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_vad;
|
||||
esp_afe_sr_iface_op_reset_op_t reset_vad;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_ns;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_ns;
|
||||
esp_afe_sr_iface_op_disable_func_t disable_agc;
|
||||
esp_afe_sr_iface_op_enable_func_t enable_agc;
|
||||
esp_afe_sr_iface_op_print_pipeline_t print_pipeline;
|
||||
esp_afe_sr_iface_op_destroy_t destroy;
|
||||
} esp_afe_sr_iface_t;
|
||||
|
||||
// struct is used to store the AFE handle and data for the AFE task
|
||||
typedef struct {
|
||||
esp_afe_sr_data_t *afe_data;
|
||||
esp_afe_sr_iface_t *afe_handle;
|
||||
TaskHandle_t feed_task;
|
||||
TaskHandle_t fetch_task;
|
||||
} afe_task_into_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
13
include/esp32s2/esp_afe_sr_models.h
Normal file
13
include/esp32s2/esp_afe_sr_models.h
Normal file
@ -0,0 +1,13 @@
|
||||
#pragma once
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
esp_afe_sr_iface_t *esp_afe_handle_from_config(const afe_config_t *config);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
47
include/esp32s2/esp_agc.h
Normal file
47
include/esp32s2/esp_agc.h
Normal file
@ -0,0 +1,47 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_AGC_H_
|
||||
#define _ESP_AGC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
////all positive values are valid, negative is error
|
||||
typedef enum {
|
||||
ESP_AGC_SUCCESS = 0, ////success
|
||||
ESP_AGC_FAIL = -1, ////agc fail
|
||||
ESP_AGC_SAMPLE_RATE_ERROR = -2, ///sample rate can be only 8khz, 16khz, 32khz
|
||||
ESP_AGC_FRAME_SIZE_ERROR = -3, ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
|
||||
} ESP_AGE_ERR;
|
||||
|
||||
typedef enum {
|
||||
AGC_MODE_SR = -1, // Bypass WEBRTC AGC
|
||||
AGC_MODE_0 = 0, // Only saturation protection
|
||||
AGC_MODE_1 = 1, // Analog Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_2 = 2, // Digital Automatic Gain Control [-targetLevelDbfs (default -3 dBOv)]
|
||||
AGC_MODE_3 = 3, // Fixed Digital Gain [compressionGaindB (default 8 dB)]
|
||||
} agc_mode_t;
|
||||
|
||||
void *esp_agc_open(agc_mode_t agc_mode, int sample_rate);
|
||||
void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
|
||||
int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
|
||||
void esp_agc_close(void *agc_handle);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif // _ESP_AGC_H_
|
||||
41
include/esp32s2/esp_doa.h
Normal file
41
include/esp32s2/esp_doa.h
Normal file
@ -0,0 +1,41 @@
|
||||
#ifndef _ESP_DOA_H_
|
||||
#define _ESP_DOA_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
typedef struct doa_handle_t doa_handle_t;
|
||||
/**
|
||||
* @brief Initialize SRP-PHAT processor
|
||||
* @param fs Sampling rate (Hz), e.g., 16000
|
||||
* @param resolution Angular search resolution (degrees), e.g., 20
|
||||
* @param d_mics Microphone spacing (meters), e.g., 0.06
|
||||
* @param input_timedate_samples input timedate samples, e.g., 1024
|
||||
* @return Initialized doa_handle_t object pointer, Recommend using the above configuration for better performance
|
||||
*/
|
||||
doa_handle_t *esp_doa_create(int fs, float resolution, float d_mics, int input_timedate_samples);
|
||||
|
||||
/**
|
||||
* @brief Release all allocated resources
|
||||
* @param doa doa_handle_t instance pointer to be freed
|
||||
*/
|
||||
void esp_doa_destroy(doa_handle_t *doa);
|
||||
|
||||
/**
|
||||
* @brief Process audio frame for direction estimation
|
||||
* @param doa doa_handle_t instance pointer
|
||||
* @param left Left channel 16-bit PCM data
|
||||
* @param right Right channel 16-bit PCM data
|
||||
* @return Estimated sound direction in degrees, e.g., 0-180
|
||||
*/
|
||||
float esp_doa_process(doa_handle_t *doa, int16_t* left, int16_t* right);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif /* _ESP_DOA_H_ */
|
||||
93
include/esp32s2/esp_mase.h
Normal file
93
include/esp32s2/esp_mase.h
Normal file
@ -0,0 +1,93 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_MASE_H_
|
||||
#define _ESP_MASE_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define MASE_SAMPLE_RATE 16000 // Supports 16kHz only
|
||||
#define MASE_FRAME_SIZE 16 // Supports 16ms only
|
||||
#define MASE_MIC_DISTANCE 65 // According to physical design of mic-array
|
||||
|
||||
/**
|
||||
* @brief Sets mic-array type, currently 2-mic line array and 3-mic circular array
|
||||
* are supported.
|
||||
*/
|
||||
typedef enum {
|
||||
TWO_MIC_LINE = 0,
|
||||
THREE_MIC_CIRCLE = 1
|
||||
} mase_mic_array_type_t;
|
||||
|
||||
/**
|
||||
* @brief Sets operating mode, supporting normal mode and wake-up enhancement mode
|
||||
*/
|
||||
typedef enum {
|
||||
NORMAL_ENHANCEMENT_MODE = 0,
|
||||
WAKE_UP_ENHANCEMENT_MODE = 1
|
||||
} mase_op_mode_t;
|
||||
|
||||
typedef void* mase_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the MASE structure.
|
||||
*
|
||||
* @param fs The sampling frequency (Hz) must be 16000.
|
||||
*
|
||||
* @param frame_size The length of the audio processing must be 16ms.
|
||||
*
|
||||
* @param array_type '0' for 2-mic line array and '1' for 3-mic circular array.
|
||||
*
|
||||
* @param mic_distance The distance between neighboring microphones in mm.
|
||||
*
|
||||
* @param operating_mode '0' for normal mode and '1' for wake-up enhanced mode.
|
||||
*
|
||||
* @param filter_strength Strength of the mic-array speech enhancement, must be 0, 1, 2 or 3.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: An instance of MASE
|
||||
*/
|
||||
mase_handle_t mase_create(int fs, int frame_size, int array_type, float mic_distance, int operating_mode, int filter_strength);
|
||||
|
||||
/**
|
||||
* @brief Performs mic array processing for one frame.
|
||||
*
|
||||
* @param inst The instance of MASE.
|
||||
*
|
||||
* @param in An array of 16-bit signed audio samples from mic.
|
||||
*
|
||||
* @param dsp_out Returns enhanced signal.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_process(mase_handle_t st, int16_t *in, int16_t *dsp_out);
|
||||
|
||||
/**
|
||||
* @brief Free the MASE instance
|
||||
*
|
||||
* @param inst The instance of MASE.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void mase_destory(mase_handle_t st);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
223
include/esp32s2/esp_mn_iface.h
Normal file
223
include/esp32s2/esp_mn_iface.h
Normal file
@ -0,0 +1,223 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
#include "esp_wn_iface.h"
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define ESP_MN_RESULT_MAX_NUM 5
|
||||
#define ESP_MN_MAX_PHRASE_NUM 400
|
||||
#define ESP_MN_MAX_PHRASE_LEN 63
|
||||
#define ESP_MN_MIN_PHRASE_LEN 2
|
||||
|
||||
#define ESP_MN_PREFIX "mn"
|
||||
#define ESP_MN_ENGLISH "en"
|
||||
#define ESP_MN_CHINESE "cn"
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_STATE_DETECTING = 0, // detecting
|
||||
ESP_MN_STATE_DETECTED = 1, // detected
|
||||
ESP_MN_STATE_TIMEOUT = 2, // time out
|
||||
} esp_mn_state_t;
|
||||
|
||||
//Set multinet loading mode
|
||||
//The memory consumption is decreased with increasing mode,
|
||||
//As a consequence also the CPU loading rate goes up
|
||||
typedef enum {
|
||||
ESP_MN_LOAD_FROM_PSRAM = 0, // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
|
||||
ESP_MN_LOAD_FROM_PSRAM_FLASH = 1, // Load some weights from PSRAM and load the rest from FLASH (default)
|
||||
ESP_MN_LOAD_FROM_FLASH = 2, // Load more weights from FLASH. Minimum memory consumption with slowest computation
|
||||
} esp_mn_loader_mode_t;
|
||||
|
||||
typedef enum {
|
||||
ESP_MN_GREEDY_SEARCH = 0, // greedy search
|
||||
ESP_MN_BEAM_SEARCH = 1, // beam search
|
||||
ESP_MN_BEAM_SEARCH_WITH_FST = 2, // beam search with trie language model
|
||||
} esp_mn_search_method_t;
|
||||
|
||||
typedef enum {
|
||||
CHINESE_ID = 1, // Chinese language
|
||||
ENGLISH_ID = 2, // English language
|
||||
} language_id_t;
|
||||
|
||||
// Return all possible recognition results
|
||||
typedef struct{
|
||||
esp_mn_state_t state;
|
||||
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
|
||||
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id.
|
||||
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
|
||||
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
|
||||
char string[256];
|
||||
} esp_mn_results_t;
|
||||
|
||||
typedef struct {
|
||||
char *string; // command string
|
||||
char *phonemes; // command phonemes, if applicable
|
||||
int16_t command_id; // the command id
|
||||
float threshold; // trigger threshold, default: 0
|
||||
int16_t *wave; // prompt wave data of the phrase
|
||||
} esp_mn_phrase_t;
|
||||
|
||||
typedef struct _mn_node_ {
|
||||
esp_mn_phrase_t *phrase;
|
||||
struct _mn_node_ *next;
|
||||
} esp_mn_node_t;
|
||||
|
||||
typedef struct{
|
||||
int16_t num; // The number of error phrases, which can not added into model
|
||||
esp_mn_phrase_t **phrases; // The array of error phrase pointer
|
||||
} esp_mn_error_t;
|
||||
|
||||
/**
|
||||
* @brief Initialize a model instance with specified model name.
|
||||
*
|
||||
* @param model_name The multinet model name.
|
||||
* @param duration The duration (ms) to trigger the timeout
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
|
||||
|
||||
/**
|
||||
* @brief Switch multinet mode to change memory consumption and CPU loading
|
||||
*
|
||||
* @warning Just Support multinet6 or later versions
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param mode The multinet loader mode
|
||||
*
|
||||
* @returns Handle to the model data.
|
||||
*/
|
||||
typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Callback function type to fetch the number of frames recognized by the command word
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of the frames recognized by the command word
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the language of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return Language name string defined in esp_mn_models.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
|
||||
*/
|
||||
typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
||||
*
|
||||
* @param model The model object to query.
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @return The state of multinet
|
||||
*/
|
||||
typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Destroy a speech commands recognition model
|
||||
*
|
||||
* @param model The Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get recognition results
|
||||
*
|
||||
* @param model The Model object to query
|
||||
*
|
||||
* @return The current results.
|
||||
*/
|
||||
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Open the log print
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Clean all status of model
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
*
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Set the speech commands by mn_command_root
|
||||
*
|
||||
* @param model_data The model object to query.
|
||||
* @param mn_command_root The speech commands link.
|
||||
* @return The error phrase id info.
|
||||
*/
|
||||
typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
*/
|
||||
typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
|
||||
|
||||
/**
|
||||
* @brief Check if input string can be tokenized
|
||||
*
|
||||
* @param model_data The model object to query
|
||||
* @param str The input string
|
||||
*/
|
||||
typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
|
||||
|
||||
typedef struct {
|
||||
esp_mn_iface_op_create_t create;
|
||||
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
||||
esp_mn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_mn_iface_op_get_language_t get_language;
|
||||
esp_mn_iface_op_detect_t detect;
|
||||
esp_mn_iface_op_destroy_t destroy;
|
||||
esp_mn_iface_op_get_results_t get_results;
|
||||
esp_mn_iface_op_open_log_t open_log;
|
||||
esp_mn_iface_op_clean_t clean;
|
||||
esp_wn_iface_op_set_speech_commands set_speech_commands;
|
||||
esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
|
||||
esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
|
||||
esp_mn_iface_op_check_speech_command check_speech_command;
|
||||
} esp_mn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
66
include/esp32s2/esp_mn_models.h
Normal file
66
include/esp32s2/esp_mn_models.h
Normal file
@ -0,0 +1,66 @@
|
||||
#pragma once
|
||||
#include "esp_mn_iface.h"
|
||||
|
||||
//Contains declarations of all available speech recognition models. Pair this up with the right coefficients and you have a model that can recognize
|
||||
//a specific phrase or word.
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
/**
|
||||
* @brief Get the multinet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of multinet
|
||||
*/
|
||||
esp_mn_iface_t *esp_mn_handle_from_name(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the multinet language from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The language of multinet
|
||||
*/
|
||||
char *esp_mn_language_from_name(char *model_name);
|
||||
|
||||
/*
|
||||
Configure wake word to use based on what's selected in menuconfig.
|
||||
*/
|
||||
|
||||
#ifdef CONFIG_SR_MN_CN_MULTINET2_SINGLE_RECOGNITION
|
||||
#include "multinet2_ch.h"
|
||||
#define MULTINET_COEFF get_coeff_multinet2_ch
|
||||
#define MULTINET_MODEL_NAME "mn2_cn"
|
||||
|
||||
#else
|
||||
#define MULTINET_COEFF "COEFF_NULL"
|
||||
#define MULTINET_MODEL_NAME "NULL"
|
||||
#endif
|
||||
|
||||
|
||||
/* example
|
||||
|
||||
static const esp_mn_iface_t *multinet = &MULTINET_MODEL;
|
||||
|
||||
//Initialize MultiNet model data
|
||||
model_iface_data_t *model_data = multinet->create(&MULTINET_COEFF);
|
||||
add_speech_commands(multinet, model_data);
|
||||
|
||||
//Set parameters of buffer
|
||||
int audio_chunksize=model->get_samp_chunksize(model_data);
|
||||
int frequency = model->get_samp_rate(model_data);
|
||||
int16_t *buffer=malloc(audio_chunksize*sizeof(int16_t));
|
||||
|
||||
//Detect
|
||||
int r=model->detect(model_data, buffer);
|
||||
if (r>0) {
|
||||
printf("Detection triggered output %d.\n", r);
|
||||
}
|
||||
|
||||
//Destroy model
|
||||
model->destroy(model_data)
|
||||
|
||||
*/
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
86
include/esp32s2/esp_ns.h
Normal file
86
include/esp32s2/esp_ns.h
Normal file
@ -0,0 +1,86 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_NS_H_
|
||||
#define _ESP_NS_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define NS_USE_SPIARM 0
|
||||
#define NS_FRAME_LENGTH_MS 10 //Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* The Sampling frequency (Hz) must be 16000Hz
|
||||
*/
|
||||
|
||||
typedef void* ns_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the NS structure.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can be 10ms, 20ms, 30ms.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_create(int frame_length);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of the more powerful noise suppression algorithm.
|
||||
*
|
||||
* @warning frame_length only supports 10 ms.
|
||||
*
|
||||
* @param frame_length The length of the audio processing can only be 10ms.
|
||||
* @param mode 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of NS
|
||||
*/
|
||||
ns_handle_t ns_pro_create(int frame_length, int mode, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the NS and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
*
|
||||
* @param outdata An array of 16-bit signed audio samples after noise suppression.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
|
||||
|
||||
/**
|
||||
* @brief Free the NS instance
|
||||
*
|
||||
* @param inst The instance of NS.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void ns_destroy(ns_handle_t inst);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_NS_H_
|
||||
64
include/esp32s2/esp_nsn_iface.h
Normal file
64
include/esp32s2/esp_nsn_iface.h
Normal file
@ -0,0 +1,64 @@
|
||||
#pragma once
|
||||
#include "stdint.h"
|
||||
|
||||
//Opaque model data container
|
||||
typedef struct esp_nsn_data_t esp_nsn_data_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance
|
||||
*
|
||||
* @param model_name The name of the model instance
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the process function
|
||||
*
|
||||
* Every noise suppression model processes a certain number of samples at the same time. This function
|
||||
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the process function
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the noise suppression model and get data after process.
|
||||
*
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
|
||||
* get_samp_chunksize function.
|
||||
* @param out_data An array of 16-bit signed audio samples after process.
|
||||
* @return The state of return.
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the process function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a noise suppression model
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a noise suppression model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_nsn_iface_op_create_t create;
|
||||
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_nsn_iface_op_process_t process;
|
||||
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_nsn_iface_op_destroy_t destroy;
|
||||
} esp_nsn_iface_t;
|
||||
17
include/esp32s2/esp_nsn_models.h
Normal file
17
include/esp32s2/esp_nsn_models.h
Normal file
@ -0,0 +1,17 @@
|
||||
#pragma once
|
||||
|
||||
#include "esp_nsn_iface.h"
|
||||
|
||||
/*
|
||||
The prefix of nsnet
|
||||
Now there are nsnet1 and nsnet2
|
||||
*/
|
||||
#define ESP_NSNET_PREFIX "nsnet"
|
||||
|
||||
/**
|
||||
* @brief Get the nsnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of nsnet
|
||||
*/
|
||||
esp_nsn_iface_t *esp_nsnet_handle_from_name(char *model_name);
|
||||
@ -48,7 +48,7 @@ float *esp_win_func_init(char *win_type, float *window_data, int frame_length);
|
||||
|
||||
float *esp_fftr(float *x, int nfft, void *fft_table);
|
||||
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_table);
|
||||
float *esp_spectrum_step(float *x, int nfft, bool use_power, void *fft_handle);
|
||||
|
||||
void esp_audio_short_to_float(short *samples, float *x, int len, int remove_dc);
|
||||
|
||||
|
||||
84
include/esp32s2/esp_sr_webrtc.h
Normal file
84
include/esp32s2/esp_sr_webrtc.h
Normal file
@ -0,0 +1,84 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_WEBRTC_H_
|
||||
#define _ESP_WEBRTC_H_
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
#include "esp_agc.h"
|
||||
#include "esp_log.h"
|
||||
#include "esp_ns.h"
|
||||
#include "sr_ringbuf.h"
|
||||
#include <stdint.h>
|
||||
|
||||
#include "esp_heap_caps.h"
|
||||
|
||||
typedef struct {
|
||||
void *ns_handle;
|
||||
void *agc_handle;
|
||||
int frame_size;
|
||||
int sample_rate;
|
||||
int16_t *buff;
|
||||
int16_t *out_data;
|
||||
sr_ringbuf_handle_t rb;
|
||||
} webrtc_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance of webrtc.
|
||||
*
|
||||
* @warning frame_length_ms supports 10 ms, 20 ms, 30 ms and 32 ms.
|
||||
*
|
||||
* @param frame_length_ms The length of the audio processing
|
||||
* @param ns_mode The mode of NS. -1 means NS is disabled. 0: Mild, 1: Medium, 2: Aggressive
|
||||
* @param agc_mode The mode of AGC
|
||||
* @param agc_gain The gain of AGC. default is 9
|
||||
* @param agc_target_level The target level of AGC. default is -3 dbfs
|
||||
* @param sample_rate The sample rate of the audio.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of webrtc
|
||||
*/
|
||||
webrtc_handle_t *webrtc_create(
|
||||
int frame_length_ms, int ns_mode, agc_mode_t agc_mode, int agc_gain, int agc_target_level, int sample_rate);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the webrtc and get the audio stream after Noise suppression.
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
* @param indata An array of 16-bit signed audio samples.
|
||||
* @param size The sample size of output data
|
||||
* @param enable_ns Enable noise suppression
|
||||
* @param enable_agc Enable automatic gain control
|
||||
*
|
||||
* @return data after noise suppression
|
||||
*/
|
||||
int16_t *webrtc_process(webrtc_handle_t *handle, int16_t *indata, int *size, bool enable_ns, bool enable_agc);
|
||||
|
||||
/**
|
||||
* @brief Free the webrtc instance
|
||||
*
|
||||
* @param handle The instance of webrtc.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void webrtc_destroy(webrtc_handle_t *handle);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_WEBRTC_H_
|
||||
178
include/esp32s2/esp_vad.h
Normal file
178
include/esp32s2/esp_vad.h
Normal file
@ -0,0 +1,178 @@
|
||||
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
|
||||
//
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
//
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License
|
||||
#ifndef _ESP_VAD_H_
|
||||
#define _ESP_VAD_H_
|
||||
|
||||
#include <stdint.h>
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#define SAMPLE_RATE_HZ 16000 // Supports 32000, 16000, 8000
|
||||
#define VAD_FRAME_LENGTH_MS 30 // Supports 10ms, 20ms, 30ms
|
||||
|
||||
/**
|
||||
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
|
||||
* restrictive in reporting speech. So if you want to trigger on more speech, please select a lower mode.
|
||||
*/
|
||||
typedef enum {
|
||||
VAD_MODE_0 = 0, // Normal
|
||||
VAD_MODE_1, // Aggressive
|
||||
VAD_MODE_2, // Very Aggressive
|
||||
VAD_MODE_3, // Very Very Aggressive
|
||||
VAD_MODE_4 // Very Very Very Aggressive
|
||||
} vad_mode_t;
|
||||
|
||||
typedef enum {
|
||||
VAD_SILENCE = 0,
|
||||
VAD_SPEECH = 1,
|
||||
} vad_state_t;
|
||||
|
||||
typedef struct vad_trigger_tag {
|
||||
vad_state_t state;
|
||||
unsigned int min_speech_len;
|
||||
unsigned int noise_len;
|
||||
unsigned int min_noise_len;
|
||||
unsigned int speech_len;
|
||||
} vad_trigger_t;
|
||||
|
||||
// Parenthesized so the expansion stays a single operand when the macro is used
// inside a larger expression (e.g. negation or multiplication).
#define vad_MAX_LEN (INT32_MAX - 1)
|
||||
/**
|
||||
* @brief Allocate VAD trigger
|
||||
*
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
*
|
||||
* @return Trigger pointer
|
||||
**/
|
||||
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Free VAD trigger
|
||||
**/
|
||||
void vad_trigger_free(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Reset VAD trigger
|
||||
**/
|
||||
void vad_trigger_reset(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Detect voice activity by trigger
|
||||
**/
|
||||
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
int sample_rate;
|
||||
int frame_size;
|
||||
} vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t *vad_handle_t;
|
||||
|
||||
// typedef vad_handle_tag * vad_handle_t;
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
*
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param sample_rate Sample rate in Hz
|
||||
* @param one_frame_ms Length of the audio chunksize, can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @param min_speech_ms Minimum speech duration, unit is ms
|
||||
* @param min_noise_ms Minimum noise duration, unit is ms
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(
|
||||
vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @param sample_rate_hz The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
|
||||
* @param one_frame_ms The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, int one_frame_ms);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
* @param data An array of 16-bit signed audio samples.
|
||||
* @return
|
||||
* - VAD_SILENCE if no voice
|
||||
* - VAD_SPEECH if voice is detected
|
||||
*
|
||||
*/
|
||||
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
|
||||
|
||||
/**
|
||||
* @brief Reset trigger state as Silence
|
||||
*
|
||||
* @param handle The instance of VAD.
|
||||
*/
|
||||
void vad_reset_trigger(vad_handle_t handle);
|
||||
|
||||
/**
|
||||
* @brief Free the VAD instance
|
||||
*
|
||||
* @param inst The instance of VAD.
|
||||
*
|
||||
* @return None
|
||||
*
|
||||
*/
|
||||
void vad_destroy(vad_handle_t inst);
|
||||
|
||||
/*
|
||||
* Programming Guide:
|
||||
*
|
||||
* @code{c}
|
||||
* vad_handle_t vad_inst = vad_create(VAD_MODE_3); // Creates an instance to
|
||||
* the VAD structure.
|
||||
*
|
||||
* while (1) {
|
||||
* //Use buffer to receive the audio data from MIC.
|
||||
* vad_state_t vad_state = vad_process(vad_inst, buffer, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS); // Feed samples to the VAD process and get the result.
|
||||
* }
|
||||
*
|
||||
* vad_destroy(vad_inst); // Free the VAD instance at the end of whole VAD process
|
||||
*
|
||||
* @endcode
|
||||
*/
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
|
||||
#endif //_ESP_VAD_H_
|
||||
164
include/esp32s2/esp_vadn_iface.h
Normal file
164
include/esp32s2/esp_vadn_iface.h
Normal file
@ -0,0 +1,164 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
#include "dl_lib_convq_queue.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of channels of the samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger wake words, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The voice activity state: VAD_SILENCE if no voice is detected,
|
||||
* VAD_SPEECH if voice is detected.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Feed MFCC of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param cq An array of 16-bit MFCC.
|
||||
* @return The voice activity state: VAD_SILENCE if no voice is detected,
|
||||
* VAD_SPEECH if voice is detected.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_mfcc_t)(model_iface_data_t *model, dl_convq_queue_t *cq);
|
||||
|
||||
/**
|
||||
* @brief Get MFCC of an audio stream
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return MFCC data
|
||||
*/
|
||||
typedef dl_convq_queue_t* (*esp_vadn_iface_op_get_mfcc_data_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_detect_mfcc_t detect_mfcc;
|
||||
esp_vadn_iface_op_get_mfcc_data_t get_mfcc_data;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32s2/esp_vadn_models.h
Normal file
22
include/esp32s2/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model names, used to filter all vadnet models from the available models.
|
||||
#define ESP_VADN_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the vadnet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of vadnet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -29,6 +29,7 @@ typedef enum {
|
||||
DET_MODE_2CH_95 = 3,
|
||||
DET_MODE_3CH_90 = 4,
|
||||
DET_MODE_3CH_95 = 5,
|
||||
DET_MODE_90_COPY_PARAMS = 6, // Aggressive
|
||||
} det_mode_t;
|
||||
|
||||
typedef struct {
|
||||
@ -110,12 +111,21 @@ typedef char* (*esp_wn_iface_op_get_word_name_t)(model_iface_data_t *model, int
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.5~0.9999
|
||||
* @param det_threshold The threshold to trigger wake words, the range of det_threshold is 0.4~0.9999
|
||||
* @param word_index The index of wake word
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold, int word_index);
|
||||
|
||||
/**
|
||||
* @brief Reset the threshold to its initial state
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_wn_iface_op_reset_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
|
||||
/**
|
||||
* @brief Get the wake word detection threshold of different modes
|
||||
*
|
||||
@ -200,6 +210,7 @@ typedef struct {
|
||||
esp_wn_iface_op_get_word_num_t get_word_num;
|
||||
esp_wn_iface_op_get_word_name_t get_word_name;
|
||||
esp_wn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_wn_iface_op_reset_det_threshold_t reset_det_threshold;
|
||||
esp_wn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_wn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_wn_iface_op_get_vol_gain_t get_vol_gain;
|
||||
|
||||
@ -11,7 +11,7 @@ extern "C" {
|
||||
/**
|
||||
* @brief Get the wakenet handle from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of wakenet
|
||||
*/
|
||||
const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
@ -19,10 +19,10 @@ const esp_wn_iface_t *esp_wn_handle_from_name(const char *model_name);
|
||||
/**
|
||||
* @brief Get the wake word name from model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @param model_name The name of model
|
||||
* @returns The wake word name, like "alexa","hilexin","xiaoaitongxue"
|
||||
*/
|
||||
char* esp_wn_wakeword_from_name(const char *model_name);
|
||||
char *esp_wn_wakeword_from_name(const char *model_name);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
||||
20
include/esp32s2/flite_g2p.h
Normal file
20
include/esp32s2/flite_g2p.h
Normal file
@ -0,0 +1,20 @@
|
||||
#ifndef __FLITE_G2P_H__
|
||||
#define __FLITE_G2P_H__
|
||||
|
||||
typedef struct {
|
||||
int num_phonemes;
|
||||
int phoneme_size;
|
||||
char **phonemes;
|
||||
} flite_g2p_result;
|
||||
|
||||
void flite_g2p_result_free(flite_g2p_result *result);
|
||||
|
||||
flite_g2p_result *flite_g2p_get_result(const char *grapheme);
|
||||
|
||||
void flite_g2p_result_print_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p_result_get_string(flite_g2p_result *result, int map_phonemes);
|
||||
|
||||
char *flite_g2p(const char *graphemes, int map_phonemes);
|
||||
|
||||
#endif
|
||||
@ -12,7 +12,7 @@ esp_mfcc_opts_t *get_mfcc_opts_wn9();
|
||||
/**
|
||||
* @brief Return basic opts used in wakenet9s
|
||||
**/
|
||||
esp_mfcc_opts_t *get_mfcc_opts_wn9s16();
|
||||
esp_mfcc_opts_t *get_mfcc_opts(const char *win_type, bool use_power, int winstep_ms, int winlen_ms, int nfilter);
|
||||
|
||||
/**
|
||||
* @brief Return basic opts for default kaldifeat
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user