From a8b77d0795b83a4e8d9e713220df71b51d31bcc3 Mon Sep 17 00:00:00 2001
From: xysun
Date: Fri, 14 Feb 2025 14:51:53 +0800
Subject: [PATCH] docs: add migration guide

---
 README.md                                     |  8 +++-
 docs/en/audio_front_end/README.rst            | 22 ++++++++---
 docs/en/audio_front_end/index.rst             |  3 +-
 docs/en/audio_front_end/migration_guide.rst   | 60 ++++++++++++++++++++
 docs/en/index.rst                             |  2 +-
 docs/en/vadnet/{readme.rst => README.rst}     |  7 ++--
 docs/zh_CN/audio_front_end/README.rst         | 11 +++--
 docs/zh_CN/audio_front_end/index.rst          |  3 +-
 .../zh_CN/audio_front_end/migration_guide.rst | 58 +++++++++++++++++++
 docs/zh_CN/index.rst                          |  2 +-
 docs/zh_CN/vadnet/{readme.rst => README.rst}  | 13 +++---
 11 files changed, 170 insertions(+), 19 deletions(-)
 create mode 100644 docs/en/audio_front_end/migration_guide.rst
 rename docs/en/vadnet/{readme.rst => README.rst} (92%)
 create mode 100644 docs/zh_CN/audio_front_end/migration_guide.rst
 rename docs/zh_CN/vadnet/{readme.rst => README.rst} (84%)

diff --git a/README.md b/README.md
index 3e291c8..df3c1d9 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,11 @@ These algorithms are provided in the form of a component, so they can be integra
 
 ESP32-S3/ESP32-P4 are recommended, which support AI instructions and larger, high-speed octal SPI PSRAM. The new algorithms will no longer support ESP32 chips.
 
+## News
+
+[14/2/2025]: We release **ESP-SR V2.0**. [Migration from ESP-SR V1.* to ESP-SR V2.*](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/audio_front_end/migration_guide.html)
+
+[13/2/2025]: We release **VADNet**, a voice activity detection model. You can use it to replace the WebRTC VAD and improve performance.
 
 ## Wake Word Engine
 
@@ -62,7 +67,7 @@ The following wake words are supported in esp-sr:
 |小鸭小鸭 | | wn9_xiaoyaxiaoya_tts2 |
 |璃奈板 | | wn9_linaiban_tts2 |
 |小酥肉 | | wn9_xiaosurou_tts2 |
-|小宇同学 | | wn9_小宇同学_tts2 |
+|小宇同学 | | wn9_xiaoyutongxue_tts2 |
 
 *NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples. `_tts2` suffix means this WakeNet model is trained by TTS Pipeline V2.
 
@@ -85,7 +90,6 @@ Espressif Audio Front-End **AFE** integrates AEC (Acoustic Echo Cancellation), V
 
 Our two-mic Audio Front-End (AFE) have been qualified as a “Software Audio Front-End Solution” for [Amazon Alexa Built-in devices](https://developer.amazon.com/en-US/alexa/solution-providers/dev-kits#software-audio-front-end-dev-kits).
 
-Now AFE V2.0 has been released, which is more efficient than AFE V1.0. and supports more models.
 
 **In order to achieve optimal performance:**
 
diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst
index 5f50a52..eadd4c8 100644
--- a/docs/en/audio_front_end/README.rst
+++ b/docs/en/audio_front_end/README.rst
@@ -67,7 +67,7 @@ The ``input_format`` parameter specifies the arrangement of audio channels in th
 +-----------+---------------------+
 
 **Example:**
-``"MMNR"`` Indicates four channels: two microphone channels, one unused channel, and one playback reference channel.
+``"MMNR"`` indicates four channels, in order: microphone channel, microphone channel, unused channel, and playback reference channel.
 
 .. note::
 
@@ -126,11 +126,11 @@ Input audio data to the AFE for processing. The input data must match the ``inpu
 Step 4: Fetch Processed Audio
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-Retrieve the processed single-channel audio output:
+Retrieve the processed single-channel audio data and detection states:
 
 .. code-block:: c
 
-    afe_fetch_result_t *result = fetch(afe_data);
+    afe_fetch_result_t *result = afe_handle->fetch(afe_data);
     int16_t *processed_audio = result->data;
     vad_state_t vad_state = result->vad_state;
     wakenet_state_t wakeup_state = result->wakeup_state;
@@ -140,6 +140,22 @@ Retrieve the processed single-channel audio output:
     int16_t *vad_cache = result->vad_cache;
     }
 
+.. code-block:: c
+
+    // Get the processed audio with a specified delay; the default delay is 2000 ms
+    afe_fetch_result_t *result = afe_handle->fetch_with_delay(afe_data, 100 / portTICK_PERIOD_MS);
+
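+A minimal sketch of acting on these detection states; ``VAD_SILENCE`` also appears in the VADNet examples, while ``WAKENET_DETECTED`` is assumed here to be the detected value of ``wakenet_state_t``:
+
+.. code-block:: c
+
+    if (wakeup_state == WAKENET_DETECTED) {
+        // Wake word detected in this frame
+    }
+    if (vad_state != VAD_SILENCE) {
+        // Voice activity detected: processed_audio contains speech
+    }
+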
 Resource Occupancy
 ------------------
 
diff --git a/docs/en/audio_front_end/index.rst b/docs/en/audio_front_end/index.rst
index 4fff541..f999c97 100644
--- a/docs/en/audio_front_end/index.rst
+++ b/docs/en/audio_front_end/index.rst
@@ -5,4 +5,5 @@ AFE Audio Front-end
     :maxdepth: 1
 
     AFE Introduction <README>
-    Espressif Microphone Design Guidelines <Espressif_Microphone_Design_Guidelines>
\ No newline at end of file
+    Espressif Microphone Design Guidelines <Espressif_Microphone_Design_Guidelines>
+    Migration from V1.* to V2.* <migration_guide>
\ No newline at end of file
diff --git a/docs/en/audio_front_end/migration_guide.rst b/docs/en/audio_front_end/migration_guide.rst
new file mode 100644
index 0000000..6a84b47
--- /dev/null
+++ b/docs/en/audio_front_end/migration_guide.rst
@@ -0,0 +1,60 @@
+Migration from V1.* to V2.*
+===========================
+
+:link_to_translation:`zh_CN:[中文]`
+
+Configuration and Initialization
+--------------------------------
+
+1. The legacy configuration initialization macro ``AFE_CONFIG_DEFAULT()`` has been removed. Please use ``afe_config_init`` to initialize the configuration. Modifications can still be made after initialization:
+
+   .. code-block:: c
+
+       afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+
+2. ``ESP_AFE_SR_HANDLE`` and ``ESP_AFE_VC_HANDLE`` have been removed. Use ``esp_afe_handle_from_config`` to create an instance:
+
+   .. code-block:: c
+
+       esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
+
+Input Data Format Changes
+---------------------------
+
+The new version supports more flexible input formats via the ``input_format`` parameter. This parameter defines the arrangement of audio channels in the input data.
+
+You therefore only need to provide the correct ``input_format``; there is no need to rearrange the audio data. Each character in the string represents a channel type:
+
++-----------+---------------------+
+| Character | Description         |
++===========+=====================+
+| ``M``     | Microphone channel  |
++-----------+---------------------+
+| ``R``     | Playback reference  |
+|           | channel             |
++-----------+---------------------+
+| ``N``     | Unused or unknown   |
+|           | channel             |
++-----------+---------------------+
+
+**Example:**
+``MMNR`` indicates four channels, ordered as: microphone channel, microphone channel, unused channel, playback reference channel.
+
+.. note::
+
+   AFE v2.0 introduces additional configuration options. For details, refer to :doc:`AFE <../audio_front_end/README>` and :doc:`VAD <../vadnet/README>`.
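+
+Putting It Together
+-------------------
+
+The following is a minimal sketch of the full V2.* setup flow described above. It assumes that models are loaded with ``esp_srmodel_init`` from a partition labeled ``model`` and that the AFE instance is created with ``create_from_config``, as in other ESP-SR examples; adapt these names to your project:
+
+.. code-block:: c
+
+    // Load models (the partition label "model" is an assumption) and build the V2.* configuration
+    srmodel_list_t *models = esp_srmodel_init("model");
+    afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+
+    // Create the interface from the configuration, then create the AFE instance
+    esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
+    esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);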
diff --git a/docs/en/index.rst b/docs/en/index.rst
index b8135df..a6209e9 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -16,7 +16,7 @@ ESP-SR User Guide
     Getting Started <getting_started/readme>
     Audio Front-end (AFE) <audio_front_end/index>
     Wake Word WakeNet <wake_word_engine/README>
-    VAD Model <vadnet/readme>
+    VAD Model <vadnet/README>
     Speech Command Word MultiNet <speech_command_word/README>
     Speech Synthesis (Only Supports Chinese Language) <speech_synthesis/readme>
     Flashing Models <flash_model/README>
diff --git a/docs/en/vadnet/readme.rst b/docs/en/vadnet/README.rst
similarity index 92%
rename from docs/en/vadnet/readme.rst
rename to docs/en/vadnet/README.rst
index e01e3d0..afd2194 100644
--- a/docs/en/vadnet/readme.rst
+++ b/docs/en/vadnet/README.rst
@@ -8,7 +8,7 @@ VADNet is a Voice Activaty Detection model built upon neural network for low-pow
 
 Overview
 --------
 
-VADNet uses a model structure and data processing flow similar to WakeNet, for more details, you can refer to :doc:`AFE <../wake_word_engine/README>`
+VADNet uses a model structure and data processing flow similar to WakeNet; for more details, refer to :doc:`WakeNet <../wake_word_engine/README>`.
 
 VADNet is trained by about 5,000 hours of Chinese data, 5,000 hours of English data, and 5,000 hours of multilingual data.
@@ -18,7 +18,10 @@ Use VADNet
 
 - Select VADNet model
 
-  To select VADNet model, please refer to Section :doc:`Flashing Models <../flash_model/README>` .
+  ::
+
+      idf.py menuconfig
+      ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium).
 
 - Run VADNet
 
diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst
index e476174..83dc8ff 100644
--- a/docs/zh_CN/audio_front_end/README.rst
+++ b/docs/zh_CN/audio_front_end/README.rst
@@ -64,7 +64,7 @@ AFE 声学前端算法框架
 +-----------+---------------------+
 
 **示例:**
-``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。
+``"MMNR"``:表示四通道排列,依次为麦克风通道、麦克风通道、未使用通道和播放参考通道。
 
 .. note::
 
@@ -121,11 +121,11 @@ AFE 声学前端算法框架
 步骤4:获取处理结果
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-获取处理后的单通道音频输出:
+获取处理后的单通道音频输出和检测状态:
 
 .. code-block:: c
 
-    afe_fetch_result_t *result = fetch(afe_data);
+    afe_fetch_result_t *result = afe_handle->fetch(afe_data);
     int16_t *processed_audio = result->data;
     vad_state_t vad_state = result->vad_state;
     wakenet_state_t wakeup_state = result->wakeup_state;
@@ -135,6 +135,11 @@ 获取处理后的单通道音频输出:
     int16_t *vad_cache = result->vad_cache;
     }
 
+.. code-block:: c
+
+    // Get the processed audio with a specified delay; the default delay is 2000 ms
+    afe_fetch_result_t *result = afe_handle->fetch_with_delay(afe_data, 100 / portTICK_PERIOD_MS);
+
 资源占用
 ------------------
 
diff --git a/docs/zh_CN/audio_front_end/index.rst b/docs/zh_CN/audio_front_end/index.rst
index 6d7d13f..9d2cd3f 100644
--- a/docs/zh_CN/audio_front_end/index.rst
+++ b/docs/zh_CN/audio_front_end/index.rst
@@ -5,4 +5,5 @@ AFE 声学前端
     :maxdepth: 1
 
     AFE 声学前端模型简介 <README>
-    乐鑫麦克风设计指南 <Espressif_Microphone_Design_Guidelines>
\ No newline at end of file
+    乐鑫麦克风设计指南 <Espressif_Microphone_Design_Guidelines>
+    升级指南 <migration_guide>
\ No newline at end of file
diff --git a/docs/zh_CN/audio_front_end/migration_guide.rst b/docs/zh_CN/audio_front_end/migration_guide.rst
new file mode 100644
index 0000000..c51b9f1
--- /dev/null
+++ b/docs/zh_CN/audio_front_end/migration_guide.rst
@@ -0,0 +1,58 @@
+从 V1.* 迁移到 V2.*
+===========================
+
+:link_to_translation:`en:[English]`
+
+配置和初始化
+--------------------------------
+
+1. 旧的配置初始化宏 ``AFE_CONFIG_DEFAULT()`` 已被移除。请使用 ``afe_config_init`` 来初始化配置。初始化后仍可进行修改:
+
+   .. code-block:: c
+
+       afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+
+2. ``ESP_AFE_SR_HANDLE`` 和 ``ESP_AFE_VC_HANDLE`` 已被移除。使用 ``esp_afe_handle_from_config`` 来创建实例:
+
+   .. code-block:: c
+
+       esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
+
+输入数据格式修改
+---------------------------
+
+新版本通过 ``input_format`` 参数支持更灵活的输入格式。此参数定义了输入数据中音频通道的排列方式。
+
+因此,您只需要提供正确的 ``input_format``,无需重新排列音频数据。字符串中的每个字符代表一种通道类型:
+
++-----------+---------------------+
+| 字符      | 描述                |
++===========+=====================+
+| ``M``     | 麦克风通道          |
++-----------+---------------------+
+| ``R``     | 播放参考通道        |
++-----------+---------------------+
+| ``N``     | 未使用或未知通道    |
++-----------+---------------------+
+
+**示例:**
+``MMNR`` 表示四个通道,依次为:麦克风通道、麦克风通道、未使用通道、播放参考通道。
+
+.. note::
+
+   AFE v2.0 引入了额外的配置选项。详细信息请参阅 :doc:`AFE <../audio_front_end/README>` 和 :doc:`VAD <../vadnet/README>`。
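+
+完整示例
+----------------
+
+以下是上述 V2.* 初始化流程的最小示例。其中假设使用 ``esp_srmodel_init`` 从名为 ``model`` 的分区加载模型,并通过 ``create_from_config`` 创建 AFE 实例(与其他 ESP-SR 示例写法一致),具体名称请根据实际工程调整:
+
+.. code-block:: c
+
+    // Load models (the partition label "model" is an assumption) and build the V2.* configuration
+    srmodel_list_t *models = esp_srmodel_init("model");
+    afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
+
+    // Create the interface from the configuration, then create the AFE instance
+    esp_afe_sr_iface_t *afe_handle = esp_afe_handle_from_config(afe_config);
+    esp_afe_sr_data_t *afe_data = afe_handle->create_from_config(afe_config);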
diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst
index 9edf5f0..b482f15 100644
--- a/docs/zh_CN/index.rst
+++ b/docs/zh_CN/index.rst
@@ -17,7 +17,7 @@ ESP-SR 用户指南
     入门指南 <getting_started/readme>
     AFE 声学前端算法 <audio_front_end/index>
     语音唤醒 WakeNet <wake_word_engine/README>
-    VAD <vadnet/readme>
+    VAD <vadnet/README>
     语音指令 MultiNet <speech_command_word/README>
     语音合成(仅支持中文) <speech_synthesis/readme>
     模型加载 <flash_model/README>
diff --git a/docs/zh_CN/vadnet/readme.rst b/docs/zh_CN/vadnet/README.rst
similarity index 84%
rename from docs/zh_CN/vadnet/readme.rst
rename to docs/zh_CN/vadnet/README.rst
index 6285515..ec6535c 100644
--- a/docs/zh_CN/vadnet/readme.rst
+++ b/docs/zh_CN/vadnet/README.rst
@@ -8,7 +8,7 @@ VADNet 是一个基于神经网络的语音活动检测模型,专为低功耗
 
 概述
 --------
 
-VADNet 采用了与 WakeNet 相似的模型结构和数据处理流程,更多实现细节可参考 :doc:`音频前端处理模块 <../audio_front_end/README>` 中的说明。
+VADNet 采用了与 WakeNet 相似的模型结构和数据处理流程,更多实现细节可参考 :doc:`WakeNet <../wake_word_engine/README>` 中的说明。
 
 VADNet 训练数据包括了大约5000小时中文数据, 5000 小时英文数据,还有5000小时的多语言数据。
@@ -17,7 +17,10 @@ VADNet 训练数据包括了大约5000小时中文数据, 5000 小时英文数
 
 - 选择VADNet模型
 
-  选择VADNet模型请参考 :doc:`模型烧录指南 <../flash_model/README>` 。
+  ::
+
+      idf.py menuconfig
+      ESP Speech Recognition -> Select voice activity detection -> voice activity detection (vadnet1 medium).
 
 - 运行VADNet
 
@@ -54,9 +57,9 @@
 
     afe_fetch_result_t* result = afe_handle->fetch(afe_data);
     if (result->vad_cache_size > 0) {
-        printf("vad缓存大小: %d\n", result->vad_cache_size);
-        fwrite(result->vad_cache, 1, result->vad_cache_size, fp); // 写入缓存数据
+        printf("vad cache size: %d\n", result->vad_cache_size);
+        fwrite(result->vad_cache, 1, result->vad_cache_size, fp);
     }
-    printf("vad状态: %s\n", res->vad_state==VAD_SILENCE ? "环境噪声" : "语音活动");
+    printf("vad state: %s\n", result->vad_state==VAD_SILENCE ? "noise" : "speech");
 
 资源占用