mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
feat(esp32p4): add vadnet1
This commit is contained in:
parent
404fa46e38
commit
d72ed551cb
@ -13,14 +13,9 @@ choice MODEL_DATA_PATH
|
||||
endchoice
|
||||
|
||||
|
||||
config USE_AFE
|
||||
bool "use afe"
|
||||
default "y"
|
||||
|
||||
choice AFE_INTERFACE_SEL
|
||||
prompt "Afe interface"
|
||||
default AFE_INTERFACE_V1
|
||||
depends on USE_AFE
|
||||
help
|
||||
Select the afe interface to be used.
|
||||
|
||||
@ -29,187 +24,60 @@ choice AFE_INTERFACE_SEL
|
||||
|
||||
endchoice
|
||||
|
||||
config USE_NSNET
|
||||
bool "use nsnet"
|
||||
default "n"
|
||||
|
||||
choice SR_NSN_MODEL_LOAD
|
||||
prompt "Select deep noise suppression"
|
||||
default SR_NSN_NSNET2
|
||||
depends on USE_NSNET
|
||||
prompt "Select noise suppression model"
|
||||
default SR_NSN_WEBRTC
|
||||
help
|
||||
Select the deep noise suppression to be loaded.
|
||||
Select the noise suppression model to be loaded.
|
||||
|
||||
config SR_NSN_NONE
|
||||
bool "None"
|
||||
config SR_NSN_WEBRTC
|
||||
bool "noise suppression (WebRTC)"
|
||||
|
||||
config SR_NSN_NSNET1
|
||||
bool "Deep noise suppression v1 (nsnet1)"
|
||||
depends on IDF_TARGET_ESP32S3
|
||||
config SR_NSN_NSNET2
|
||||
bool "Deep noise suppression v2 (nsnet2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
endchoice
|
||||
|
||||
config USE_WAKENET
|
||||
bool "use wakenet"
|
||||
default "y"
|
||||
choice SR_VAD_MODEL_LOAD
|
||||
prompt "Select voice activity detection"
|
||||
default SR_VADNET_WEBRTC
|
||||
help
|
||||
Select the vad model to be loaded.
|
||||
|
||||
config SR_VAD_WEBRTC
|
||||
bool "voice activity detection (WebRTC)"
|
||||
|
||||
config SR_VADNET1_MODLE_MEDIUM
|
||||
bool "voice activity detection (vadnet1 medium)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
endchoice
|
||||
|
||||
choice SR_WN_MODEL_LOAD
|
||||
prompt "Select wake words"
|
||||
default SR_WN_WN9_HILEXIN
|
||||
depends on USE_WAKENET
|
||||
default SR_WN_WN5_HILEXIN
|
||||
depends on IDF_TARGET_ESP32
|
||||
help
|
||||
Select the Wake Words to be loaded.
|
||||
|
||||
config SR_WN_WN5_HILEXIN
|
||||
bool "Hi,乐鑫 (wn5_hilexin)"
|
||||
depends on IDF_TARGET_ESP32
|
||||
bool "Hi,Lexin (wn5_hilexin)"
|
||||
|
||||
config SR_WN_WN5X3_HILEXIN
|
||||
bool "Hi,乐鑫 (wn5_hilexinX3)"
|
||||
depends on IDF_TARGET_ESP32
|
||||
bool "Hi,Lexin (wn5_hilexinX3)"
|
||||
|
||||
config SR_WN_WN5_NIHAOXIAOZHI
|
||||
bool "你好小智 (wn5_nihaoxiaozhi)"
|
||||
depends on IDF_TARGET_ESP32
|
||||
bool "nihaoxiaozhi (wn5_nihaoxiaozhi)"
|
||||
|
||||
config SR_WN_WN5X3_NIHAOXIAOZHI
|
||||
bool "你好小智 (wn5_nihaoxiaozhiX3)"
|
||||
depends on IDF_TARGET_ESP32
|
||||
bool "nihaoxiaozhi (wn5_nihaoxiaozhiX3)"
|
||||
|
||||
config SR_WN_WN5X3_NIHAOXIAOXIN
|
||||
bool "你好小鑫 (wn5_nihaoxiaoxinX3)"
|
||||
depends on IDF_TARGET_ESP32
|
||||
|
||||
config SR_WN_WN8_ALEXA
|
||||
bool "Alexa (wn8_alexa)"
|
||||
depends on IDF_TARGET_ESP32S3
|
||||
|
||||
config SR_WN_WN9_HILEXIN
|
||||
bool "Hi,乐鑫 (wn9_hilexin)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOAITONGXUE
|
||||
bool "小爱同学 (wn9_xiaoaitongxue)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_ALEXA
|
||||
bool "Alexa (wn9_alexa)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIESP
|
||||
bool "Hi,ESP (wn9_hiesp)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIMFIVE
|
||||
bool "Hi,M Five (wn9_himfive)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_NIHAOXIAOZHI_TTS
|
||||
bool "你好小智 (wn9_nihaoxiaozhi_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_JARVIS_TTS
|
||||
bool "Jarvis (wn9_jarvis_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_COMPUTER_TTS
|
||||
bool "computer (wn9_computer_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HEYWILLOW_TTS
|
||||
bool "Hey,Willow (wn9_heywillow_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_SOPHIA_TTS
|
||||
bool "Sophia (wn9_sophia_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_NIHAOXIAOXIN_TTS
|
||||
bool "你好小鑫 (wn9_nihaoxiaoxin_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOMEITONGXUE_TTS
|
||||
bool "小美同学 (wn9_xiaomeitongxue_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIXIAOXING_TTS
|
||||
bool "Hi,小星 (wn9_hixiaoxing_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_MYCROFT_TTS
|
||||
bool "Mycroft (wn9_mycroft_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HEYPRINTER_TTS
|
||||
bool "Hey,Printer (wn9_heyprinter_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOLONGXIAOLONG_TTS
|
||||
bool "小龙小龙 (wn9_xiaolongxiaolong_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_MIAOMIAOTONGXUE_TTS
|
||||
bool "喵喵同学 (wn9_miaomiaotongxue_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIJOY_TTS
|
||||
bool "Hi,Joy (wn9_hijoy_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HILILI_TTS
|
||||
bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HITELLY_TTS
|
||||
bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HEYWANDA_TTS
|
||||
bool "Hey,Wanda (wn9_heywanda_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIMIAOMIAO_TTS
|
||||
bool "Hi,喵喵 (wn9_himiaomiao_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOBINXIAOBIN_TTS
|
||||
bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HAIXIAOWU_TTS
|
||||
bool "Hi,小巫 (wn9_haixiaowu_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_ASTROLABE_TTS
|
||||
bool "Astrolabe (wn9_astrolabe_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOYAXIAOYA_TTS2
|
||||
bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIJASON_TTS2
|
||||
bool "Hi,Jason (wn9_hijason_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_LINAIBAN_TTS2
|
||||
bool "璃奈板 (wn9_linaiban_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_CUSTOMWORD
|
||||
bool "customized word (wn9_customword)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_LOAD_MULIT_WORD
|
||||
bool "Load Multiple Wake Words"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
bool "nihaoxiaoxin (wn5_nihaoxiaoxinX3)"
|
||||
|
||||
endchoice
|
||||
|
||||
menu "Load Multiple Wake Words"
|
||||
depends on SR_WN_LOAD_MULIT_WORD
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HILEXIN_MULTI
|
||||
bool "Hi,乐鑫 (wn9_hilexin)"
|
||||
@ -241,94 +109,90 @@ menu "Load Multiple Wake Words"
|
||||
|
||||
config SR_WN_WN9_HEYWILLOW_TTS_MULTI
|
||||
bool "Hey,Willow (wn9_heywillow_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_SOPHIA_TTS_MULTI
|
||||
bool "Sophia (wn9_sophia_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_NIHAOXIAOXIN_TTS_MULTI
|
||||
bool "你好小鑫 (wn9_nihaoxiaoxin_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_XIAOMEITONGXUE_TTS_MULTI
|
||||
bool "小美同学 (wn9_xiaomeitongxue_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HEYPRINTER_TTS_MULTI
|
||||
bool "Hey,Printer (wn9_heyprinter_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_XIAOLONGXIAOLONG_TTS_MULTI
|
||||
bool "小龙小龙 (wn9_xiaolongxiaolong_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_MIAOMIAOTONGXUE_TTS_MULTI
|
||||
bool "喵喵同学 (wn9_miaomiaotongxue_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
|
||||
config SR_WN_WN9_HEYWANDA_TTS_MULTI
|
||||
bool "Hey,Wanda (wn9_heywanda_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HIMIAOMIAO_TTS_MULTI
|
||||
bool "Hi,喵喵 (wn9_himiaomiao_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
|
||||
config SR_WN_WN9_MYCROFT_TTS_MULTI
|
||||
bool "Mycroft (wn9_mycroft_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HIJOY_TTS_MULTI
|
||||
bool "Hi,Joy (wn9_hijoy_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HILILI_TTS_MULTI
|
||||
bool "Hi,Lily/Hi,莉莉 (wn9_hilili_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HITELLY_TTS_MULTI
|
||||
bool "Hi,Telly/Hi,泰力 (wn9_hitelly_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_XIAOBINXIAOBIN_TTS_MULTI
|
||||
bool "小滨小滨/小冰小冰 (wn9_xiaobinxiaobin_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HAIXIAOWU_TTS_MULTI
|
||||
bool "Hi,小巫 (wn9_haixiaowu_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_ASTROLABE_TTS_MULTI
|
||||
bool "Astrolabe (wn9_astrolabe_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_XIAOYAXIAOYA_TTS2_MULTI
|
||||
bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_HIJASON_TTS2_MULTI
|
||||
bool "Hi,Jason (wn9_hijason_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
config SR_WN_WN9_LINAIBAN_TTS2_MULTI
|
||||
bool "璃奈板 (wn9_linaiban_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
default False
|
||||
|
||||
endmenu
|
||||
|
||||
config USE_MULTINET
|
||||
bool "use multinet"
|
||||
default "y"
|
||||
|
||||
choice CHINESE_SR_MN_MODEL_SEL
|
||||
prompt "Chinese Speech Commands Model"
|
||||
default SR_MN_CN_MULTINET6_QUANT
|
||||
depends on USE_MULTINET
|
||||
default SR_MN_CN_NONE
|
||||
help
|
||||
Select the Wake Word Engine to be used.
|
||||
Select the Chinese Speech Commands Model.
|
||||
|
||||
config SR_MN_CN_NONE
|
||||
bool "None"
|
||||
@ -362,9 +226,8 @@ endchoice
|
||||
choice ENGLISH_SR_MN_MODEL_SEL
|
||||
prompt "English Speech Commands Model"
|
||||
default SR_MN_EN_NONE
|
||||
depends on USE_MULTINET
|
||||
help
|
||||
Select the Wake Word Engine to be used.
|
||||
Select the English Speech Commands Model.
|
||||
|
||||
config SR_MN_EN_NONE
|
||||
bool "None"
|
||||
|
||||
@ -92,6 +92,10 @@ typedef struct {
|
||||
char *afe_ns_model_name;
|
||||
bool fixed_first_channel; // If true, the channel after first wake-up is fixed to raw data of microphone
|
||||
// otherwise, select channel number by wakenet
|
||||
char *vad_model_name; // The model name of vad, support vadnet1 and vadnet1_small
|
||||
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms
|
||||
int vad_min_noise_ms; // The minimum duration of noise/silence in ms. It should be bigger than 64 ms
|
||||
bool vad_mute_playback; // If true, the playback will be muted for vad detection
|
||||
} afe_config_t;
|
||||
|
||||
|
||||
@ -126,6 +130,10 @@ typedef struct {
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#elif CONFIG_IDF_TARGET_ESP32P4
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
@ -158,6 +166,10 @@ typedef struct {
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#elif CONFIG_IDF_TARGET_ESP32S3
|
||||
#define AFE_CONFIG_DEFAULT() { \
|
||||
@ -190,6 +202,10 @@ typedef struct {
|
||||
.afe_ns_mode = NS_MODE_SSP, \
|
||||
.afe_ns_model_name = NULL, \
|
||||
.fixed_first_channel = true, \
|
||||
.vad_model_name = NULL, \
|
||||
.vad_min_speech_ms = 64, \
|
||||
.vad_min_noise_ms = 256, \
|
||||
.vad_mute_playback = false, \
|
||||
}
|
||||
#endif
|
||||
|
||||
|
||||
@ -29,6 +29,8 @@ typedef struct afe_fetch_result_t
|
||||
{
|
||||
int16_t *data; // the data of audio.
|
||||
int data_size; // the size of data. The unit is byte.
|
||||
int16_t *vad_cache; // the cache data of vad. It's only valid when vad_cache_size > 0. It is used to complete the audio that was truncated.
|
||||
int vad_cache_size; // the size of vad_cache. The unit is byte.
|
||||
float data_volume; // the volume of input audio, the unit is decibel(dB). This value is calculated before agc. (note: invalid in vc).
|
||||
// if enable wakenet, the window length is the receptive fields of wakenet(about 1.5s), otherwise is the frame length.
|
||||
wakenet_state_t wakeup_state; // the value is wakenet_state_t
|
||||
@ -36,7 +38,7 @@ typedef struct afe_fetch_result_t
|
||||
int wakenet_model_index; // if there are multiple wakenets, this value identifies which model be wakes up. Index start from 1.
|
||||
afe_vad_state_t vad_state; // the value is afe_vad_state_t
|
||||
int trigger_channel_id; // the channel index of output
|
||||
int wake_word_length; // the length of wake word. It's unit is the number of samples.
|
||||
int wake_word_length; // the length of wake word. The unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
void* reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
@ -4,7 +4,6 @@
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
#if defined CONFIG_USE_AFE
|
||||
#include "esp_afe_sr_iface.h"
|
||||
|
||||
|
||||
@ -19,17 +18,6 @@ extern const esp_afe_sr_iface_t esp_afe_vc_v1;
|
||||
#endif
|
||||
|
||||
|
||||
#else
|
||||
|
||||
|
||||
#include "esp_afe_sr_iface.h"
|
||||
extern const esp_afe_sr_iface_t esp_afe_sr_v1;
|
||||
extern const esp_afe_sr_iface_t esp_afe_vc_v1;
|
||||
#define ESP_AFE_SR_HANDLE esp_afe_sr_v1
|
||||
#define ESP_AFE_VC_HANDLE esp_afe_vc_v1
|
||||
|
||||
#endif
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
@ -25,22 +25,65 @@ extern "C" {
|
||||
|
||||
/**
|
||||
* @brief Sets the VAD operating mode. A more aggressive (higher mode) VAD is more
|
||||
* restrictive in reporting speech.
|
||||
* restrictive in reporting speech. If you want more audio to be reported as speech, select a lower mode.
|
||||
*/
|
||||
typedef enum {
|
||||
VAD_MODE_0 = 0,
|
||||
VAD_MODE_1,
|
||||
VAD_MODE_2,
|
||||
VAD_MODE_3,
|
||||
VAD_MODE_4
|
||||
VAD_MODE_0 = 0, // Normal
|
||||
VAD_MODE_1, // Aggressive
|
||||
VAD_MODE_2, // Very Aggressive
|
||||
VAD_MODE_3, // Very Very Aggressive
|
||||
VAD_MODE_4 // Very Very Very Aggressive
|
||||
} vad_mode_t;
|
||||
|
||||
typedef enum {
|
||||
VAD_SILENCE = 0,
|
||||
VAD_SPEECH
|
||||
VAD_SPEECH = 1,
|
||||
} vad_state_t;
|
||||
|
||||
typedef void* vad_handle_t;
|
||||
typedef struct vad_trigger_tag {
|
||||
vad_state_t state;
|
||||
unsigned int min_speech_len;
|
||||
unsigned int noise_len;
|
||||
unsigned int min_noise_len;
|
||||
unsigned int speech_len;
|
||||
} vad_trigger_t;
|
||||
|
||||
#define vad_MAX_LEN INT32_MAX - 1
|
||||
/**
|
||||
* @brief Allocate a VAD trigger
|
||||
*
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
*
|
||||
* @return Trigger pointer
|
||||
**/
|
||||
vad_trigger_t *vad_trigger_alloc(int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Free a VAD trigger
|
||||
**/
|
||||
void vad_trigger_free(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Reset a VAD trigger
|
||||
**/
|
||||
void vad_trigger_reset(vad_trigger_t *trigger);
|
||||
|
||||
/**
|
||||
* @brief Detect voice activity using the trigger
|
||||
**/
|
||||
vad_state_t vad_trigger_detect(vad_trigger_t *trigger, vad_state_t state);
|
||||
|
||||
|
||||
typedef struct {
|
||||
vad_trigger_t *trigger;
|
||||
void *vad_inst;
|
||||
}vad_handle_with_trigger_t;
|
||||
|
||||
typedef vad_handle_with_trigger_t* vad_handle_t;
|
||||
|
||||
// typedef vad_handle_tag * vad_handle_t;
|
||||
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
@ -53,6 +96,18 @@ typedef void* vad_handle_t;
|
||||
*/
|
||||
vad_handle_t vad_create(vad_mode_t vad_mode);
|
||||
|
||||
/**
|
||||
* @brief Creates an instance to the VAD structure.
|
||||
*
|
||||
* @param vad_mode Sets the VAD operating mode.
|
||||
* @param min_speech_len Minimum frame number of speech duration
|
||||
* @param min_noise_len Minimum frame number of noise duration
|
||||
* @return
|
||||
* - NULL: Create failed
|
||||
* - Others: The instance of VAD
|
||||
*/
|
||||
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int min_speech_len, int min_noise_len);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
|
||||
*
|
||||
|
||||
142
include/esp32p4/esp_vadn_iface.h
Normal file
142
include/esp32p4/esp_vadn_iface.h
Normal file
@ -0,0 +1,142 @@
|
||||
#pragma once
|
||||
#include "esp_vad.h"
|
||||
#include "stdint.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// Opaque model data container
|
||||
typedef struct model_iface_data_t model_iface_data_t;
|
||||
|
||||
// /**
|
||||
// * @brief The state of vad
|
||||
// */
|
||||
// typedef enum {
|
||||
// VAD_NOISE = -1, // Noise
|
||||
// VADNET_STATE_SILENCE = 0, // Silence
|
||||
// VAD_SPEECH = 1 // Speech
|
||||
// } vad_state_t;
|
||||
|
||||
/**
|
||||
* @brief Easy function type to initialize a model instance with a detection mode
|
||||
* and specified model name
|
||||
*
|
||||
* @param model_name The specified model name
|
||||
* @param mode The voice activity detection mode
|
||||
* @param channel_num The number of input audio channels
|
||||
* @param min_speech_ms The minimum duration of speech in ms to trigger vad
|
||||
* speech
|
||||
* @param min_noise_ms The minimum duration of noise in ms to trigger vad
|
||||
* noise
|
||||
* @returns Handle to the model data
|
||||
*/
|
||||
typedef model_iface_data_t *(*esp_vadn_iface_op_create_t)(
|
||||
const void *model_name, vad_mode_t mode, int channel_num, int min_speech_ms, int min_noise_ms);
|
||||
|
||||
/**
|
||||
* @brief Get the amount of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The amount of samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the channel number of samples that need to be passed to the detect
|
||||
* function
|
||||
*
|
||||
* Every speech recognition model processes a certain number of samples at the
|
||||
* same time. This function can be used to query that amount. Note that the
|
||||
* returned amount is in 16-bit samples, not in bytes.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The number of channels of the samples to feed the detect function
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_channel_num_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Get the sample rate of the samples to feed to the detect function
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The sample rate, in hz
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Set the detection threshold to manually adjust the probability
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param det_threshold The voice activity detection threshold, the range of
|
||||
* det_threshold is 0.5~0.9999
|
||||
* @return 0: setting failed, 1: setting success
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
||||
|
||||
/**
|
||||
* @brief Get the voice activity detection threshold
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @returns the detection threshold
|
||||
*/
|
||||
typedef float (*esp_vadn_iface_op_get_det_threshold_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Feed samples of an audio stream to the vad model and detect whether is
|
||||
* voice.
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @param samples An array of 16-bit signed audio samples. The array size used
|
||||
* can be queried by the get_samp_chunksize function.
|
||||
* @return The voice activity state as a vad_state_t value: VAD_SILENCE or
|
||||
* VAD_SPEECH.
|
||||
*/
|
||||
typedef vad_state_t (*esp_vadn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
||||
|
||||
/**
|
||||
* @brief Get the triggered channel index. Channel index starts from zero
|
||||
*
|
||||
* @param model The model object to query
|
||||
* @return The channel index
|
||||
*/
|
||||
typedef int (*esp_vadn_iface_op_get_triggered_channel_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Clean all states of model
|
||||
*
|
||||
* @param model The model object to query
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_clean_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* @brief Destroy a model object
|
||||
*
|
||||
* @param model Model object to destroy
|
||||
*/
|
||||
typedef void (*esp_vadn_iface_op_destroy_t)(model_iface_data_t *model);
|
||||
|
||||
/**
|
||||
* This structure contains the functions used to do operations on a voice
|
||||
* activity detection model.
|
||||
*/
|
||||
typedef struct {
|
||||
esp_vadn_iface_op_create_t create;
|
||||
esp_vadn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
||||
esp_vadn_iface_op_get_channel_num_t get_channel_num;
|
||||
esp_vadn_iface_op_get_samp_rate_t get_samp_rate;
|
||||
esp_vadn_iface_op_set_det_threshold_t set_det_threshold;
|
||||
esp_vadn_iface_op_get_det_threshold_t get_det_threshold;
|
||||
esp_vadn_iface_op_get_triggered_channel_t get_triggered_channel;
|
||||
esp_vadn_iface_op_detect_t detect;
|
||||
esp_vadn_iface_op_clean_t clean;
|
||||
esp_vadn_iface_op_destroy_t destroy;
|
||||
} esp_vadn_iface_t;
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
22
include/esp32p4/esp_vadn_models.h
Normal file
22
include/esp32p4/esp_vadn_models.h
Normal file
@ -0,0 +1,22 @@
|
||||
#pragma once
|
||||
#include "esp_vadn_iface.h"
|
||||
|
||||
#ifdef __cplusplus
|
||||
extern "C" {
|
||||
#endif
|
||||
|
||||
// The prefix of vadnet model names; it is used to filter all vadnet models from the available models.
|
||||
#define ESP_VADNET_PREFIX "vadnet"
|
||||
|
||||
/**
|
||||
* @brief Get the vadnet handle from the model name
|
||||
*
|
||||
* @param model_name The name of model
|
||||
* @returns The handle of vadnet
|
||||
*/
|
||||
const esp_vadn_iface_t *esp_vadn_handle_from_name(const char *model_name);
|
||||
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
#endif
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
BIN
lib/esp32p4/libvadnet.a
Normal file
BIN
lib/esp32p4/libvadnet.a
Normal file
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@ -1 +1 @@
|
||||
vadnet1_medium50k_Speech_5_0.849_0.573
|
||||
vadnet1_50k_Speech_3_0.5_0.1
|
||||
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue
Block a user