diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst index 1cd5027..9b42514 100644 --- a/docs/en/audio_front_end/README.rst +++ b/docs/en/audio_front_end/README.rst @@ -54,15 +54,17 @@ Input Format Definition The ``input_format`` parameter specifies the arrangement of audio channels in the input data. Each character in the string represents a channel type: -+-----------+-------------+ -| Character | Description | -+===========+=============+ -| ``M`` | Microphone channel | -+-----------+-------------+ -| ``R`` | Playback reference channel | -+-----------+-------------+ -| ``N`` | Unused or unknown channel | -+-----------+-------------+ ++-----------+---------------------+ +| Character | Description | ++===========+=====================+ +| ``M`` | Microphone channel | ++-----------+---------------------+ +| ``R`` | Playback reference | +| | channel | ++-----------+---------------------+ +| ``N`` | Unused or unknown | +| | channel | ++-----------+---------------------+ **Example:** - ``"MMNR"``: Indicates four channels: two microphone channels, one unused channel, and one playback reference channel. @@ -73,6 +75,9 @@ The ``input_format`` parameter specifies the arrangement of audio channels in th Using the AFE Framework ---------------------------- +In ``menuconfig`` -> ``ESP Speech Recognition``, select the required AFE (Audio Front End) models, such as the WakeNet model, VAD (Voice Activity Detection) model, NS (Noise Suppression) model, etc., and then call the AFE framework in the code using the following steps. +For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp`. + Step 1: Initialize AFE Configuration ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -80,6 +85,7 @@ Get the default configuration using ``afe_config_init()`` and customize paramete ..
code-block:: c + srmodel_list_t *models = esp_srmodel_init("model"); afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); - **``input_format``**: Define the channel arrangement (e.g., ``"MMNR"``). diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst index d028800..2826a3a 100644 --- a/docs/zh_CN/audio_front_end/README.rst +++ b/docs/zh_CN/audio_front_end/README.rst @@ -53,15 +53,15 @@ AFE 声学前端算法框架 ``input_format`` 参数定义了输入数据中音频通道的排列方式。字符串中的每个字符代表一个通道类型: -+-----------+-------------+ -| 字符 | 描述 | -+===========+=============+ -| ``M`` | 麦克风通道 | -+-----------+-------------+ -| ``R`` | 播放参考通道 | -+-----------+-------------+ -| ``N`` | 未使用或未知通道 | -+-----------+-------------+ ++-----------+---------------------+ +| 字符 | 描述 | ++===========+=====================+ +| ``M`` | 麦克风通道 | ++-----------+---------------------+ +| ``R`` | 播放参考通道 | ++-----------+---------------------+ +| ``N`` | 未使用或未知通道 | ++-----------+---------------------+ **示例:** - ``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。 @@ -71,6 +71,8 @@ AFE 声学前端算法框架 使用AFE框架 ---------------------------- +根据 ``menuconfig`` -> ``ESP Speech Recognition`` 选择需要的AFE的模型,比如WakeNet模型,VAD模型, NS模型等,然后在代码中使用以下步骤调用AFE框架。 +代码可以参考 :project_file:`test_apps/esp-sr/main/test_afe.cpp`。 步骤1:初始化AFE配置 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -78,7 +80,8 @@ AFE 声学前端算法框架 使用 ``afe_config_init()`` 获取默认配置并根据需求调整参数: .. code-block:: c - + + srmodel_list_t *models = esp_srmodel_init("model"); afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF); - **``input_format``**:定义通道排列(如 ``"MMNR"``)。 diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index 16906bd..00ac15b 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -110,6 +110,8 @@ typedef struct { int vad_min_speech_ms; // The minimum duration of speech in ms. 
It should be bigger than 32 ms, default: 128 ms int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: // 1000 ms + int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms + // If you find vad cache can not cover all speech, please increase this value. bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false diff --git a/include/esp32p4/esp_afe_sr_iface.h b/include/esp32p4/esp_afe_sr_iface.h index 580eed9..ffc6ce2 100644 --- a/include/esp32p4/esp_afe_sr_iface.h +++ b/include/esp32p4/esp_afe_sr_iface.h @@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** - * @brief Enable VAD algorithm. + * @brief Reset one function/module/algorithm. * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled + * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe); /** * @brief Disable one function/module/algorithm. 
@@ -204,6 +204,7 @@ typedef struct { esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_reset_op_t reset_vad; esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_disable_func_t disable_agc; diff --git a/include/esp32p4/esp_vad.h b/include/esp32p4/esp_vad.h index f3c5dd4..0c7f734 100644 --- a/include/esp32p4/esp_vad.h +++ b/include/esp32p4/esp_vad.h @@ -110,7 +110,7 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking. @@ -138,6 +138,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, */ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); +/** + * @brief Reset trigger state as Silence + * + * @param handle The instance of VAD. + */ +void vad_reset_trigger(vad_handle_t handle); + /** * @brief Free the VAD instance * diff --git a/include/esp32s3/esp_afe_config.h b/include/esp32s3/esp_afe_config.h index 16906bd..00ac15b 100644 --- a/include/esp32s3/esp_afe_config.h +++ b/include/esp32s3/esp_afe_config.h @@ -110,6 +110,8 @@ typedef struct { int vad_min_speech_ms; // The minimum duration of speech in ms. It should be bigger than 32 ms, default: 128 ms int vad_min_noise_ms; // The minimum duration of noise or silence in ms. It should be bigger than 64 ms, default: // 1000 ms + int vad_delay_ms; // The delay of the first speech frame in ms, default: 128 ms + // If you find vad cache can not cover all speech, please increase this value. 
bool vad_mute_playback; // If true, the playback will be muted for vad detection. default: false bool vad_enable_channel_trigger; // If true, the vad will be used to choose the channel id. default: false diff --git a/include/esp32s3/esp_afe_sr_iface.h b/include/esp32s3/esp_afe_sr_iface.h index 580eed9..ffc6ce2 100644 --- a/include/esp32s3/esp_afe_sr_iface.h +++ b/include/esp32s3/esp_afe_sr_iface.h @@ -141,12 +141,12 @@ typedef int (*esp_afe_sr_iface_op_reset_buffer_t)(esp_afe_sr_data_t *afe); typedef int (*esp_afe_sr_iface_op_set_wakenet_t)(esp_afe_sr_data_t *afe, char *model_name); /** - * @brief Enable VAD algorithm. + * @brief Reset one function/module/algorithm. * * @param afe The AFE_SR object to query - * @return -1: fail, 0: disabled, 1: enabled + * @return -1: fail, 1: success */ -typedef int (*esp_afe_sr_iface_op_enable_vad_t)(esp_afe_sr_data_t *afe); +typedef int (*esp_afe_sr_iface_op_reset_op_t)(esp_afe_sr_data_t *afe); /** * @brief Disable one function/module/algorithm. @@ -204,6 +204,7 @@ typedef struct { esp_afe_sr_iface_op_enable_func_t enable_se; esp_afe_sr_iface_op_disable_func_t disable_vad; esp_afe_sr_iface_op_enable_func_t enable_vad; + esp_afe_sr_iface_op_reset_op_t reset_vad; esp_afe_sr_iface_op_disable_func_t disable_ns; esp_afe_sr_iface_op_enable_func_t enable_ns; esp_afe_sr_iface_op_disable_func_t disable_agc; diff --git a/include/esp32s3/esp_vad.h b/include/esp32s3/esp_vad.h index f3c5dd4..0c7f734 100644 --- a/include/esp32s3/esp_vad.h +++ b/include/esp32s3/esp_vad.h @@ -110,7 +110,7 @@ vad_handle_t vad_create(vad_mode_t vad_mode); * - NULL: Create failed * - Others: The instance of VAD */ -vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_len, int min_noise_len); +vad_handle_t vad_create_with_param(vad_mode_t vad_mode, int sample_rate, int one_frame_ms, int min_speech_ms, int min_noise_ms); /** * @brief Feed samples of an audio stream to the VAD and check if there is someone 
speaking. @@ -138,6 +138,13 @@ vad_state_t vad_process(vad_handle_t handle, int16_t *data, int sample_rate_hz, */ vad_state_t vad_process_with_trigger(vad_handle_t handle, int16_t *data); +/** + * @brief Reset trigger state as Silence + * + * @param handle The instance of VAD. + */ +void vad_reset_trigger(vad_handle_t handle); + /** * @brief Free the VAD instance * diff --git a/lib/esp32p4/libesp_audio_front_end.a b/lib/esp32p4/libesp_audio_front_end.a index 705fa4a..cce5d29 100644 Binary files a/lib/esp32p4/libesp_audio_front_end.a and b/lib/esp32p4/libesp_audio_front_end.a differ diff --git a/lib/esp32p4/libesp_audio_processor.a b/lib/esp32p4/libesp_audio_processor.a index 4487112..f32dd0f 100644 Binary files a/lib/esp32p4/libesp_audio_processor.a and b/lib/esp32p4/libesp_audio_processor.a differ diff --git a/lib/esp32p4/libmultinet.a b/lib/esp32p4/libmultinet.a index 1345a5c..16dca4f 100644 Binary files a/lib/esp32p4/libmultinet.a and b/lib/esp32p4/libmultinet.a differ diff --git a/lib/esp32p4/libvadnet.a b/lib/esp32p4/libvadnet.a index 58bc2b0..8c3424e 100644 Binary files a/lib/esp32p4/libvadnet.a and b/lib/esp32p4/libvadnet.a differ diff --git a/lib/esp32p4/libwakenet.a b/lib/esp32p4/libwakenet.a index 20890d1..6eba4cd 100644 Binary files a/lib/esp32p4/libwakenet.a and b/lib/esp32p4/libwakenet.a differ diff --git a/lib/esp32s3/libesp_audio_front_end.a b/lib/esp32s3/libesp_audio_front_end.a index cad62f2..4089104 100644 Binary files a/lib/esp32s3/libesp_audio_front_end.a and b/lib/esp32s3/libesp_audio_front_end.a differ diff --git a/lib/esp32s3/libesp_audio_processor.a b/lib/esp32s3/libesp_audio_processor.a index 4df5207..d113daf 100644 Binary files a/lib/esp32s3/libesp_audio_processor.a and b/lib/esp32s3/libesp_audio_processor.a differ diff --git a/lib/esp32s3/libmultinet.a b/lib/esp32s3/libmultinet.a index 891fac2..62e7576 100644 Binary files a/lib/esp32s3/libmultinet.a and b/lib/esp32s3/libmultinet.a differ diff --git a/lib/esp32s3/libnsnet.a 
b/lib/esp32s3/libnsnet.a index 81e0015..f396b67 100644 Binary files a/lib/esp32s3/libnsnet.a and b/lib/esp32s3/libnsnet.a differ diff --git a/lib/esp32s3/libvadnet.a b/lib/esp32s3/libvadnet.a index 40949a8..533e9bd 100644 Binary files a/lib/esp32s3/libvadnet.a and b/lib/esp32s3/libvadnet.a differ diff --git a/lib/esp32s3/libwakenet.a b/lib/esp32s3/libwakenet.a index b74c5a8..0ff21b1 100644 Binary files a/lib/esp32s3/libwakenet.a and b/lib/esp32s3/libwakenet.a differ