diff --git a/CMakeLists.txt b/CMakeLists.txt index c405182..5f768f9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3") add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32s3/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) @@ -95,6 +96,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3") esp_tts_chinese voice_set_xiaole nsnet + vadnet wakenet "-Wl,--end-group") @@ -153,6 +155,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4") add_prebuilt_library(flite_g2p "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libflite_g2p.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_processor.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(wakenet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libwakenet.a" PRIV_REQUIRES ${COMPONENT_NAME}) + add_prebuilt_library(vadnet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libvadnet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(multinet "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libmultinet.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(esp_audio_front_end "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libesp_audio_front_end.a" PRIV_REQUIRES ${COMPONENT_NAME}) add_prebuilt_library(hufzip "${CMAKE_CURRENT_SOURCE_DIR}/lib/esp32p4/libhufzip.a" PRIV_REQUIRES ${COMPONENT_NAME}) 
@@ -173,6 +176,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32p4") esp_tts_chinese voice_set_xiaole wakenet + vadnet nsnet "-Wl,--end-group") diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 6cac4c6..c32689d 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -92,6 +92,10 @@ typedef struct { char *afe_ns_model_name; bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone // otherwise, select channel number by wakenet + char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small + int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms + int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms + bool vad_mute_playback; // If true, the playback will be muted for vad detection } afe_config_t; @@ -126,6 +130,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32P4 #define AFE_CONFIG_DEFAULT() { \ @@ -158,6 +166,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #elif CONFIG_IDF_TARGET_ESP32S3 #define AFE_CONFIG_DEFAULT() { \ @@ -190,6 +202,10 @@ typedef struct { .afe_ns_mode = NS_MODE_SSP, \ .afe_ns_model_name = NULL, \ .fixed_first_channel = true, \ + .vad_model_name = NULL, \ + .vad_min_speech_ms = 64, \ + .vad_min_noise_ms = 256, \ + .vad_mute_playback = false, \ } #endif diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index 0b52ea4..84d7000 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h 
@@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t { int16_t *data; // the data of audio. int data_size; // the size of data. The unit is byte. + int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated. + int vad_cache_size; // the size of vad_cache. The unit is byte. float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc). // if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length. wakenet_state_t wakeup_state; // the value is wakenet_state_t @@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1. afe_vad_state_t vad_state; // the value is afe_vad_state_t int trigger_channel_id; // the channel index of output - int wake_word_length; // the length of wake word. It's unit is the number of samples. + int wake_word_length; // the length of wake word. The unit is the number of samples. 
int ret_value; // the return state of fetch function void* reserved; // reserved for future use } afe_fetch_result_t; diff --git a/include/esp32s3/esp_afe_sr_models.h b/include/esp32s3/esp_afe_sr_models.h index feaad43..39de63f 100644 --- a/include/esp32s3/esp_afe_sr_models.h +++ b/include/esp32s3/esp_afe_sr_models.h @@ -4,7 +4,6 @@ extern "C" { #endif -#if defined CONFIG_USE_AFE #include "esp_afe_sr_iface.h" @@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1; #endif -#else - - -#include "esp_afe_sr_iface.h" -extern const esp_afe_sr_iface_t esp_afe_sr_v1; -extern const esp_afe_sr_iface_t esp_afe_vc_v1; -#define ESP_AFE_SR_HANDLE esp_afe_sr_v1 -#define ESP_AFE_VC_HANDLE esp_afe_vc_v1 - -#endif - #ifdef __cplusplus } #endif \ No newline at end of file diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index 2440d39..90f8e20 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -25,22 +25,65 @@ extern "C" { /** * @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more - * restrictive in reporting speech. + * restrictive in reporting speech. To have more audio reported as speech, select a lower mode. 
*/ typedef enum { - VAD_MODE_0 = 0, - VAD_MODE_1, - VAD_MODE_2, - VAD_MODE_3, - VAD_MODE_4 + VAD_MODE_0 = 0, // Normal + VAD_MODE_1, // Aggressive + VAD_MODE_2, // Very Aggressive + VAD_MODE_3, // Very Very Aggressive + VAD_MODE_4 // Very Very Very Aggressive } vad_mode_t; typedef enum { VAD_SILENCE = 0, - VAD_SPEECH + VAD_SPEECH = 1, } vad_state_t; -typedef void* vad_handle_t; +typedef struct vad_trigger_tag { + vad_state_t state; + unsigned int min_speech_len; + unsigned int noise_len; + unsigned int min_noise_len; + unsigned int speech_len; +} vad_trigger_t; + +#define vad_MAX_LEN (INT32_MAX - 1) // parenthesized so the expansion is safe inside larger expressions +/** + * @brief Allocate a VAD trigger + * + * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * + * @return Trigger pointer + **/ +vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len); + +/** + * @brief Free a VAD trigger + **/ +void vad_trigger_free(vad_trigger_t *trigger); + +/** + * @brief Reset a VAD trigger + **/ +void vad_trigger_reset(vad_trigger_t *trigger); + +/** + * @brief Detect voice activity by trigger + **/ +vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state); + + +typedef struct { + vad_trigger_t *trigger; + void *vad_inst; +}vad_handle_with_trigger_t; + +typedef vad_handle_with_trigger_t* vad_handle_t; + +// typedef vad_handle_tag * vad_handle_t; + /** * @brief Creates an instance to the VAD structure. @@ -53,6 +96,18 @@ typedef void* vad_handle_t; */ vad_handle_t vad_create(vad_mode_t vad_mode); +/** + * @brief Creates an instance to the VAD structure. + * + * @param vad_mode Sets the VAD operating mode. 
+ * @param min_speech_len Minimum frame number of speech duration + * @param min_noise_len Minimum frame number of noise duration + * @return + * - NULL: Create failed + * - Others: The instance of VAD + */ +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len); + /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * diff --git a/include/esp32s3/esp_vadn_iface.h b/include/esp32s3/esp_vadn_iface.h new file mode 100644 index 0000000..1ec8bb9 --- /dev/null +++ b/include/esp32s3/esp_vadn_iface.h @@ -0,0 +1,142 @@ +#pragma once +#include "esp_vad.h" +#include "stdint.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// Opaque model data container +typedef struct model_iface_data_t model_iface_data_t; + +// /** +// * @brief The state of vad +// */ +// typedef enum { +// VAD_NOISE = -1, // Noise +// VADNET_STATE_SILENCE = 0, // Silence +// VAD_SPEECH = 1 // Speech +// } vad_state_t; + +/** + * @brief Easy function type to initialize a model instance with a detection mode + * and specified model name + * + * @param model_name The specified model name + * @param mode The voice activity detection mode + * @param channel_num The number of input audio channels + * @param min_speech_ms The minimum duration of speech in ms to trigger vad + * speech + * @param min_noise_ms The minimum duration of noise in ms to trigger vad + * noise + * @returns Handle to the model data + */ +typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)( + const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms); + +/** + * @brief Get the amount of samples that need to be passed to the detect + * function + * + * Every speech recognition model processes a certain number of samples at the + * same time. This function can be used to query that amount. Note that the + * returned amount is in 16-bit samples, not in bytes. 
+ * + * @param model The model object to query + * @return The amount of samples to feed the detect function + */ +typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model); + +/** + * @brief Get the number of channels of the samples that need to be passed to + * the detect function + * + * NOTE(review): presumably this is the channel_num that was passed to the + * create function; it is a channel count, not a sample count. Confirm + * against the implementation. + * + * @param model The model object to query + * @return The number of channels + */ +typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model); + +/** + * @brief Get the sample rate of the samples to feed to the detect function + * + * @param model The model object to query + * @return The sample rate, in Hz + */ +typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model); + +/** + * @brief Set the detection threshold to manually adjust the probability + * + * @param model The model object to query + * @param det_threshold The voice activity detection threshold, the range of + * det_threshold is 0.5~0.9999 + * @return 0: setting failed, 1: setting success + */ +typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold); + +/** + * @brief Get the voice activity detection threshold + * + * @param model The model object to query + * @returns the detection threshold + */ +typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model); + +/** + * @brief Feed samples of an audio stream to the vad model and detect whether it is + * voice. + * + * @param model The model object to query + * @param samples An array of 16-bit signed audio samples. The array size used + * can be queried by the get_samp_chunksize function. + * @return The detected voice activity state, as a vad_state_t value + * (see esp_vad.h). 
+ */ +typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples); + +/** + * @brief Get the triggered channel index. Channel index starts from zero + * + * @param model The model object to query + * @return The channel index + */ +typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model); + +/** + * @brief Clean all states of model + * + * @param model The model object to query + */ +typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model); + +/** + * @brief Destroy a model object + * + * @param model Model object to destroy + */ +typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model); + +/** + * This structure contains the functions used to do operations on a voice + * activity detection model. + */ +typedef struct { + esp_vadn_iface_op_create_t create; + esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize; + esp_vadn_iface_op_get_channel_num_t get_channel_num; + esp_vadn_iface_op_get_samp_rate_t get_samp_rate; + esp_vadn_iface_op_set_det_threshold_t set_det_threshold; + esp_vadn_iface_op_get_det_threshold_t get_det_threshold; + esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel; + esp_vadn_iface_op_detect_t detect; + esp_vadn_iface_op_clean_t clean; + esp_vadn_iface_op_destroy_t destroy; +} esp_vadn_iface_t; + +#ifdef __cplusplus +} +#endif \ No newline at end of file diff --git a/include/esp32s3/esp_vadn_models.h b/include/esp32s3/esp_vadn_models.h new file mode 100644 index 0000000..dc9fa6f --- /dev/null +++ b/include/esp32s3/esp_vadn_models.h @@ -0,0 +1,22 @@ +#pragma once +#include "esp_vadn_iface.h" + +#ifdef __cplusplus +extern "C" { +#endif + +// The prefix of vadnet model names, used to filter all vadnet models from the available models. 
+#define ESP_VADNET_PREFIX "vadnet" + +/** + * @brief Get the vadnet handle from model name + * + * @param model_name The name of model + * @returns The handle of vadnet + */ +const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name); + + +#ifdef __cplusplus +} +#endif diff --git a/lib/esp32s3/libc_speech_features.a b/lib/esp32s3/libc_speech_features.a index 1cd372e..108af2e 100644 Binary files a/lib/esp32s3/libc_speech_features.a and b/lib/esp32s3/libc_speech_features.a differ diff --git a/lib/esp32s3/libdl_lib.a b/lib/esp32s3/libdl_lib.a index 21626fa..d81fea5 100644 Binary files a/lib/esp32s3/libdl_lib.a and b/lib/esp32s3/libdl_lib.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index ed917cb..4eb41c2 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 8e8db4e..a444b22 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libflite_g2p.a b/lib/esp32s3/libflite_g2p.a index 76538e2..6a99a57 100644 Binary files a/lib/esp32s3/libflite_g2p.a and b/lib/esp32s3/libflite_g2p.a differ diff --git a/lib/esp32s3/libfst.a b/lib/esp32s3/libfst.a index 086a928..a2dd373 100644 Binary files a/lib/esp32s3/libfst.a and b/lib/esp32s3/libfst.a differ diff --git a/lib/esp32s3/libhufzip.a b/lib/esp32s3/libhufzip.a index b790f14..c0465b1 100644 Binary files a/lib/esp32s3/libhufzip.a and b/lib/esp32s3/libhufzip.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index b7418f8..8e3c835 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a b/lib/esp32s3/libnsnet.a index 3b00050..aaec72e 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a 
b/lib/esp32s3/libvadnet.a new file mode 100644 index 0000000..bfb4ebe Binary files /dev/null and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index 82c5c27..9a4fd3d 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ diff --git a/model/movemodel.py b/model/movemodel.py index b49aa8a..8187cdc 100644 --- a/model/movemodel.py +++ b/model/movemodel.py @@ -28,6 +28,8 @@ def copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path): for label in f: label = label.strip("\n") if 'CONFIG_SR_WN' in label and '#' not in label[0]: + if '_NONE' in label: + continue if '=' in label: label = label.split("=")[0] if '_MULTI' in label: diff --git a/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ new file mode 100644 index 0000000..549aef2 --- /dev/null +++ b/model/vadnet_model/vadnet1_medium/_MODEL_INFO_ @@ -0,0 +1 @@ +vadnet1_medium50k_Speech_5_0.849_0.573 diff --git a/model/vadnet_model/vadnet1_medium/vadn1_data b/model/vadnet_model/vadnet1_medium/vadn1_data new file mode 100644 index 0000000..ccec28c Binary files /dev/null and b/model/vadnet_model/vadnet1_medium/vadn1_data differ diff --git a/model/vadnet_model/vadnet1_medium/vadn1_index b/model/vadnet_model/vadnet1_medium/vadn1_index new file mode 100644 index 0000000..7e22357 Binary files /dev/null and b/model/vadnet_model/vadnet1_medium/vadn1_index differ