feat: add vad reset func

This commit is contained in:
xysun 2025-02-07 17:37:18 +08:00
parent d7a5bbec47
commit 2980234ae1
19 changed files with 56 additions and 27 deletions

View File

@ -54,15 +54,17 @@ Input Format Definition
The ``input_format`` parameter specifies the arrangement of audio channels in the input data. Each character in the string represents a channel type: The ``input_format`` parameter specifies the arrangement of audio channels in the input data. Each character in the string represents a channel type:
+-----------+-------------+ +-----------+---------------------+
| Character | Description | | Character | Description |
+===========+=============+ +===========+=====================+
| ``M`` | Microphone channel | | ``M`` | Microphone channel |
+-----------+-------------+ +-----------+---------------------+
| ``R`` | Playback reference channel | | ``R`` | Playback reference |
+-----------+-------------+ | | channel |
| ``N`` | Unused or unknown channel | +-----------+---------------------+
+-----------+-------------+ | ``N`` | Unused or unknown |
| | channel |
+-----------+---------------------+
**Example:** **Example:**
- ``"MMNR"``: Indicates four channels: two microphone channels, one unused channel, and one playback reference channel. - ``"MMNR"``: Indicates four channels: two microphone channels, one unused channel, and one playback reference channel.
@ -73,6 +75,9 @@ The ``input_format`` parameter specifies the arrangement of audio channels in th
Using the AFE Framework Using the AFE Framework
---------------------------- ----------------------------
In ``menuconfig`` -> ``ESP Speech Recognition``, select the models required by the AFE (Audio Front-End), such as the WakeNet model, VAD (Voice Activity Detection) model, NS (Noise Suppression) model, etc., and then call the AFE framework in your code using the following steps.
For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp`.
Step 1: Initialize AFE Configuration Step 1: Initialize AFE Configuration
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -80,6 +85,7 @@ Get the default configuration using ``afe_config_init()`` and customize paramete
.. code-block:: c .. code-block:: c
srmodel_list_t *models = esp_srmodel_init("model");
afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
- **``input_format``**: Define the channel arrangement (e.g., ``"MMNR"``). - **``input_format``**: Define the channel arrangement (e.g., ``"MMNR"``).

View File

@ -53,15 +53,15 @@ AFE 声学前端算法框架
``input_format`` 参数定义了输入数据中音频通道的排列方式。字符串中的每个字符代表一个通道类型: ``input_format`` 参数定义了输入数据中音频通道的排列方式。字符串中的每个字符代表一个通道类型:
+-----------+-------------+ +-----------+---------------------+
| 字符 | 描述 | | 字符 | 描述 |
+===========+=============+ +===========+=====================+
| ``M`` | 麦克风通道 | | ``M`` | 麦克风通道 |
+-----------+-------------+ +-----------+---------------------+
| ``R`` | 播放参考通道 | | ``R`` | 播放参考通道 |
+-----------+-------------+ +-----------+---------------------+
| ``N`` | 未使用或未知通道 | | ``N`` | 未使用或未知通道 |
+-----------+-------------+ +-----------+---------------------+
**示例:** **示例:**
- ``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。 - ``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。
@ -71,6 +71,8 @@ AFE 声学前端算法框架
使用AFE框架 使用AFE框架
---------------------------- ----------------------------
根据 ``menuconfig`` -> ``ESP Speech Recognition`` 选择需要的AFE的模型,比如WakeNet模型、VAD模型、NS模型等,然后在代码中使用以下步骤调用AFE框架。
代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp`。
步骤1初始化AFE配置 步骤1初始化AFE配置
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -79,6 +81,7 @@ AFE 声学前端算法框架
.. code-block:: c .. code-block:: c
srmodel_list_t *models = esp_srmodel_init("model");
afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
- **``input_format``**:定义通道排列(如 ``"MMNR"``)。 - **``input_format``**:定义通道排列(如 ``"MMNR"``)。

View File

@ -110,6 +110,8 @@ typedef struct {
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms // 1000 ms
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
// If you find vad cache can not cover all speech, please increase this value.
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false

View File

@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/** /**
* @brief Enable VAD algorithm. * @brief Reset one function/module/algorithm.
* *
* @param afe The AFE_SR object to query * @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled * @return -1: fail, 1: success
*/ */
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
/** /**
* @brief Disable one function/module/algorithm. * @brief Disable one function/module/algorithm.
@ -204,6 +204,7 @@ typedef struct {
esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_reset_op_t reset_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc; esp_afe_sr_iface_op_disable_func_t disable_agc;

View File

@ -110,7 +110,7 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* - NULL: Create failed * - NULL: Create failed
* - Others: The instance of VAD * - Others: The instance of VAD
*/ */
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
/** /**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -138,6 +138,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz,
*/ */
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Reset trigger state as Silence
*
* @param handle The instance of VAD.
*/
void vad_reset_trigger(vad_handle_t handle);
/** /**
* @brief Free the VAD instance * @brief Free the VAD instance
* *

View File

@ -110,6 +110,8 @@ typedef struct {
int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms
int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default:
// 1000 ms // 1000 ms
int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms
// If you find vad cache can not cover all speech, please increase this value.
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false
bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false

View File

@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe);
typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name);
/** /**
* @brief Enable VAD algorithm. * @brief Reset one function/module/algorithm.
* *
* @param afe The AFE_SR object to query * @param afe The AFE_SR object to query
* @return -1: fail, 0: disabled, 1: enabled * @return -1: fail, 1: success
*/ */
typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe);
/** /**
* @brief Disable one function/module/algorithm. * @brief Disable one function/module/algorithm.
@ -204,6 +204,7 @@ typedef struct {
esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_enable_func_t enable_se;
esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_disable_func_t disable_vad;
esp_afe_sr_iface_op_enable_func_t enable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad;
esp_afe_sr_iface_op_reset_op_t reset_vad;
esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_disable_func_t disable_ns;
esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns;
esp_afe_sr_iface_op_disable_func_t disable_agc; esp_afe_sr_iface_op_disable_func_t disable_agc;

View File

@ -110,7 +110,7 @@ vad_handle_t vad_create(vad_mode_t vad_mode);
* - NULL: Create failed * - NULL: Create failed
* - Others: The instance of VAD * - Others: The instance of VAD
*/ */
vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms);
/** /**
* @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
@ -138,6 +138,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz,
*/ */
vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data);
/**
* @brief Reset trigger state as Silence
*
* @param handle The instance of VAD.
*/
void vad_reset_trigger(vad_handle_t handle);
/** /**
* @brief Free the VAD instance * @brief Free the VAD instance
* *

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.