feat: add vadnet model and update vad interface

This commit is contained in:
xysun 2024-12-31 19:14:59 +08:00
parent 586e7eae25
commit 404fa46e38
22 changed files with 253 additions and 21 deletions

View File

@ -74,6 +74,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
@ -95,6 +96,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3")
esp_tts_chinese
voice_set_xiaole
nsnet
vadnet
wakenet
"-Wl,--end-group")
@ -153,6 +155,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4")
add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME})
add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME})
@ -173,6 +176,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4")
esp_tts_chinese
voice_set_xiaole
wakenet
vadnet
nsnet
"-Wl,--end-group")

View File

@ -92,6 +92,10 @@ typedef struct {
char *afe_ns_model_name;
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
// otherwise, select channel number by wakenet
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
bool vad_mute_playback; // If true, the playback will be muted for vad detection
} afe_config_t;
@ -126,6 +130,10 @@ typedef struct {
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32P4
#define AFE_CONFIG_DEFAULT() { \
@ -158,6 +166,10 @@ typedef struct {
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
@ -190,6 +202,10 @@ typedef struct {
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
.fixed_first_channel = true, \
.vad_model_name = NULL, \
.vad_min_speech_ms = 64, \
.vad_min_noise_ms = 256, \
.vad_mute_playback = false, \
}
#endif

View File

@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
int vad_cache_size; // the size of vad_cache. The unit is byte.
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
wakenet_state_t wakeup_state; // the value is wakenet_state_t
@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
afe_vad_state_t vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. It's unit is the number of samples.
int wake_word_length; // the length of wake word. The unit is the number of samples.
int ret_value; // the return state of fetch function
void* reserved; // reserved for future use
} afe_fetch_result_t;

View File

@ -4,7 +4,6 @@
extern "C" {
#endif
#if defined CONFIG_USE_AFE
#include "esp_afe_sr_iface.h"
@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#endif
#else
#include "esp_afe_sr_iface.h"
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
#endif
#ifdef __cplusplus
}
#endif

View File

@ -25,22 +25,65 @@ extern "C" {
/**
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
* restrictive in reporting speech.
* restrictive in reporting speech. So if you want to trigger on more speech, please select a lower mode.
*/
typedef enum {
VAD_MODE_0 = 0,
VAD_MODE_1,
VAD_MODE_2,
VAD_MODE_3,
VAD_MODE_4
VAD_MODE_0 = 0, // Normal
VAD_MODE_1, // Aggressive
VAD_MODE_2, // Very Aggressive
VAD_MODE_3, // Very Very Aggressive
VAD_MODE_4 // Very Very Very Aggressive
} vad_mode_t;
typedef enum {
VAD_SILENCE = 0,
VAD_SPEECH
VAD_SPEECH = 1,
} vad_state_t;
typedef void* vad_handle_t;
// Trigger bookkeeping used by the vad_trigger_* functions declared below.
typedef struct vad_trigger_tag {
vad_state_t state; // NOTE(review): presumably the state currently reported by the trigger -- confirm
unsigned int min_speech_len; // Minimum frame number of speech duration
unsigned int noise_len; // NOTE(review): presumably the running count of noise frames -- confirm
unsigned int min_noise_len; // Minimum frame number of noise duration
unsigned int speech_len; // NOTE(review): presumably the running count of speech frames -- confirm
} vad_trigger_t;
// Upper bound for VAD length values: one below INT32_MAX (INT32_MAX comes from <stdint.h>).
// The expansion is parenthesized so the macro behaves as a single value inside
// larger expressions (e.g. `x - vad_MAX_LEN` or `-vad_MAX_LEN`).
#define vad_MAX_LEN (INT32_MAX - 1)
/**
* @brief Allocate a VAD trigger
*
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
*
* @return Trigger pointer
**/
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
/**
* @brief Free a VAD trigger
*
* @param trigger The trigger to free
**/
void vad_trigger_free(vad_trigger_t *trigger);
/**
* @brief Reset a VAD trigger
*
* @param trigger The trigger to reset
**/
void vad_trigger_reset(vad_trigger_t *trigger);
/**
* @brief Detect voice activity using the trigger
*
* @param trigger The trigger to update and query
* @param state The vad_state_t value fed to the trigger
*              (NOTE(review): presumably the raw per-frame VAD result -- confirm)
*
* @return A vad_state_t value (VAD_SILENCE or VAD_SPEECH)
**/
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
/**
* @brief VAD handle structure pairing a trigger with a VAD instance pointer
*/
typedef struct {
vad_trigger_t *trigger; // Trigger associated with this handle
void *vad_inst; // NOTE(review): presumably the underlying VAD instance -- confirm
}vad_handle_with_trigger_t;
// Handle type used by the VAD API (e.g. returned by vad_create()).
typedef vad_handle_with_trigger_t* vad_handle_t;
// typedef vad_handle_tag * vad_handle_t;
/**
* @brief Creates an instance to the VAD structure.
@ -53,6 +96,18 @@ typedef void* vad_handle_t;
*/
vad_handle_t vad_create(vad_mode_t vad_mode);
/**
* @brief Creates an instance of the VAD structure with explicit minimum
*        speech/noise durations.
*
* @param vad_mode Sets the VAD operating mode.
* @param min_speech_len Minimum frame number of speech duration
* @param min_noise_len Minimum frame number of noise duration
* @return
* - NULL: Create failed
* - Others: The instance of VAD
*/
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
/**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
*

View File

@ -0,0 +1,142 @@
#pragma once
#include "esp_vad.h"
#include "stdint.h"
#ifdef __cplusplus
extern "C" {
#endif
// Opaque model data container: only forward-declared in this header, so it is
// always handled through pointers by the function types below.
typedef struct model_iface_data_t model_iface_data_t;
// /**
// * @brief The state of vad
// */
// typedef enum {
// VAD_NOISE = -1, // Noise
// VADNET_STATE_SILENCE = 0, // Silence
// VAD_SPEECH = 1 // Speech
// } vad_state_t;
/**
* @brief Easy function type to initialize a model instance with a detection mode
* and specified model name
*
* @param model_name The specified model name
* @param mode The voice activity detection mode
* @param channel_num The number of input audio channels
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
* speech
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
* noise
* @returns Handle to the model data
*/
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
/**
* @brief Get the number of samples that need to be passed to the detect
* function
*
* Every speech recognition model processes a certain number of samples at the
* same time. This function can be used to query that number. Note that the
* returned value is in 16-bit samples, not in bytes.
*
* @param model The model object to query
* @return The number of samples to feed the detect function
*/
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
/**
* @brief Get the number of audio channels that need to be passed to the detect
* function
*
* @param model The model object to query
* @return The number of channels
*/
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
/**
* @brief Get the sample rate of the samples to feed to the detect function
*
* @param model The model object to query
* @return The sample rate, in Hz
*/
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
/**
* @brief Set the detection threshold to manually adjust the probability
*
* @param model The model object to query
* @param det_threshold The voice activity detection threshold, the range of
* det_threshold is 0.5~0.9999
* @return 0: setting failed, 1: setting success
*/
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
/**
* @brief Get the voice activity detection threshold
*
* @param model The model object to query
* @return The detection threshold
*/
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
/**
* @brief Feed samples of an audio stream to the vad model and detect whether it
* is voice.
*
* @param model The model object to query
* @param samples An array of 16-bit signed audio samples. The array size used
* can be queried by the get_samp_chunksize function.
* @return The detected state as a vad_state_t value (VAD_SILENCE or
* VAD_SPEECH).
*/
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
/**
* @brief Get the triggered channel index. The channel index starts from zero.
*
* @param model The model object to query
* @return The channel index
*/
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
/**
* @brief Clean all states of the model
*
* @param model The model object to clean
*/
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
/**
* @brief Destroy a model object
*
* @param model The model object to destroy
*/
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
/**
* This structure contains the functions used to do operations on a voice
* activity detection model.
*/
typedef struct {
esp_vadn_iface_op_create_t create; // Initialize a model instance
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize; // Number of 16-bit samples per detect call
esp_vadn_iface_op_get_channel_num_t get_channel_num; // Number of input audio channels
esp_vadn_iface_op_get_samp_rate_t get_samp_rate; // Sample rate of the input samples
esp_vadn_iface_op_set_det_threshold_t set_det_threshold; // Set the detection threshold
esp_vadn_iface_op_get_det_threshold_t get_det_threshold; // Get the detection threshold
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; // Get the triggered channel index
esp_vadn_iface_op_detect_t detect; // Feed samples and get the vad_state_t result
esp_vadn_iface_op_clean_t clean; // Clean all states of the model
esp_vadn_iface_op_destroy_t destroy; // Destroy the model object
} esp_vadn_iface_t;
#ifdef __cplusplus
}
#endif

View File

@ -0,0 +1,22 @@
#pragma once
#include "esp_vadn_iface.h"
#ifdef __cplusplus
extern "C" {
#endif
// The prefix of vadnet model names, used to filter all vadnet models from the available models.
#define ESP_VADNET_PREFIX "vadnet"
/**
* @brief Get the vadnet handle from model name
*
* @param model_name The name of model
* @returns The handle of vadnet
*/
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
#ifdef __cplusplus
}
#endif

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/esp32s3/libvadnet.a Normal file

Binary file not shown.

Binary file not shown.

View File

@ -28,6 +28,8 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path):
for label in f:
label = label.strip("\n")
if 'CONFIG_SR_WN' in label and '#' not in label[0]:
if '_NONE' in label:
continue
if '=' in label:
label = label.split("=")[0]
if '_MULTI' in label:

View File

@ -0,0 +1 @@
vadnet1_medium50k_Speech_5_0.849_0.573

Binary file not shown.

Binary file not shown.