From a9efcbb4d59919bf0ae7dc570a6da0acf9a95962 Mon Sep 17 00:00:00 2001
From: "liying@espressif.com" <liying@espressif.com>
Date: Tue, 17 Jan 2023 11:11:07 +0800
Subject: [PATCH] fix doc menu

add tts into doc folder

add documentation feedback id
---
 .../img => docs/_static}/esp_chinese_tts.png  | Bin
 docs/conf_common.py                           |   2 +-
 docs/en/_templates/layout.html                |   2 +-
 docs/en/audio_front_end/README.rst            |   8 +-
 docs/en/benchmark/README.rst                  |  52 +++++++---
 docs/en/flash_model/README.rst                |   2 +-
 docs/en/getting_started/readme.rst            |   2 +-
 docs/en/index.rst                             |   1 +
 docs/en/speech_command_recognition/README.rst |   6 ++
 docs/en/speech_synthesis/readme.rst           |  74 ++++++++++++++
 docs/en/wake_word_engine/README.rst           |   6 +-
 docs/zh_CN/_templates/layout.html             |   2 +-
 docs/zh_CN/audio_front_end/README.rst         |  10 +-
 docs/zh_CN/benchmark/README.rst               |  52 +++++++---
 docs/zh_CN/getting_started/readme.rst         |   4 +-
 docs/zh_CN/index.rst                          |   1 +
 .../speech_command_recognition/README.rst     |   6 ++
 docs/zh_CN/speech_synthesis/readme.rst        |  73 ++++++++++++++
 docs/zh_CN/wake_word_engine/README.rst        |   2 +-
 esp-tts/README.md                             |  86 +---------------
 esp-tts/README_en.md                          |  92 ------------------
 21 files changed, 263 insertions(+), 220 deletions(-)
 rename {esp-tts/img => docs/_static}/esp_chinese_tts.png (100%)
 create mode 100644 docs/en/speech_synthesis/readme.rst
 create mode 100644 docs/zh_CN/speech_synthesis/readme.rst
 delete mode 100644 esp-tts/README_en.md
diff --git a/esp-tts/img/esp_chinese_tts.png b/docs/_static/esp_chinese_tts.png
similarity index 100%
rename from esp-tts/img/esp_chinese_tts.png
rename to docs/_static/esp_chinese_tts.png
diff --git a/docs/conf_common.py b/docs/conf_common.py
index b23d2e2..dd467b4 100755
--- a/docs/conf_common.py
+++ b/docs/conf_common.py
@@ -21,7 +21,7 @@ project_slug = 'esp-sr'
 
 # Contains info used for constructing target and version selector
 # Can also be hosted externally, see esp-idf for example
-versions_url = '_static/docs_version.js'
+versions_url = './_static/docs_version.js'
 
 # Final PDF filename will contains target and version
 pdf_file_prefix = u'esp-sr'
diff --git a/docs/en/_templates/layout.html b/docs/en/_templates/layout.html
index df6a184..5e7afe9 100644
--- a/docs/en/_templates/layout.html
+++ b/docs/en/_templates/layout.html
@@ -1,4 +1,4 @@
 {% extends '!layout.html' %}
 {% block comments %}
-<p style="text-align:center"><a href="https://www.espressif.com/en/company/documents/documentation_feedback?docId=4419&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">Provide feedback about this document</a></p>
+<p style="text-align:center"><a href="https://www.espressif.com/en/company/documents/documentation_feedback?docId=6473&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">Provide feedback about this document</a></p>
 {% endblock %}
diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst
index 05a931b..48969ec 100644
--- a/docs/en/audio_front_end/README.rst
+++ b/docs/en/audio_front_end/README.rst
@@ -404,4 +404,10 @@ The usage of AEC is similar to that of WakeNet. Users can disable or enable AEC
         int wake_word_length;                   // the length of wake word. It's unit is the number of samples.
         int ret_value;                          // the return state of fetch function
         void* reserved;                         // reserved for future use
-        } afe_fetch_result_t;
\ No newline at end of file
+        } afe_fetch_result_t;
+
+
+Resource Occupancy
+------------------
+
+For the resource occupancy of this model, see :doc:`Resource Occupancy <../benchmark/README>`.
\ No newline at end of file
diff --git a/docs/en/benchmark/README.rst b/docs/en/benchmark/README.rst
index 5637e26..a335d64 100644
--- a/docs/en/benchmark/README.rst
+++ b/docs/en/benchmark/README.rst
@@ -12,7 +12,7 @@ Resource Occupancy
 .. only:: esp32
 
     +-----------------+-----------------+-----------------+-----------------+
-    | algorithm Type  | RAM             | Average cpu     | Frame Length    |
+    | Algorithm Type  | RAM             | Average CPU     | Frame Length    |
     |                 |                 | loading(compute |                 |
     |                 |                 | with 2 cores)   |                 |
     +=================+=================+=================+=================+
@@ -26,7 +26,7 @@ Resource Occupancy
 .. only:: esp32s3
 
     +-----------------+-----------------+-----------------+-----------------+
-    | algorithm Type  | RAM             | Average cpu     | Frame Length    |
+    | Algorithm Type  | RAM             | Average CPU     | Frame Length    |
     |                 |                 | loading(compute |                 |
     |                 |                 | with 2 cores)   |                 |
     +=================+=================+=================+=================+
@@ -155,16 +155,38 @@ Resource Occupancy
 Performance Test
 ~~~~~~~~~~~~~~~~
 
-+-----------+-----------+-----------+-----------+-----------+
-| Model     | Distance  | Quiet     | S         | Speech    |
-| Type      |           |           | tationary | Noise     |
-|           |           |           | Noise     | (SNR = 4  |
-|           |           |           | (SNR = 4  | dB)       |
-|           |           |           | dB)       |           |
-+===========+===========+===========+===========+===========+
-| MultiNet  | 3 m       | 98%       | 93%       | 92%       |
-| 4         |           |           |           |           |
-+-----------+-----------+-----------+-----------+-----------+
-| MultiNet  | 3 m       | 94%       | 92%       | 91%       |
-| 4 Q8      |           |           |           |           |
-+-----------+-----------+-----------+-----------+-----------+
\ No newline at end of file
++-----------+-----------+----------+------------+-----------+
+| Model     | Distance  | Quiet    | Stationary | Speech    |
+| Type      |           |          | Noise      | Noise     |
+|           |           |          | (SNR = 4   | (SNR = 4  |
+|           |           |          | dB)        | dB)       |
++===========+===========+==========+============+===========+
+| MultiNet  | 3 m       | 98%      | 93%        | 92%       |
+| 4         |           |          |            |           |
++-----------+-----------+----------+------------+-----------+
+| MultiNet  | 3 m       | 94%      | 92%        | 91%       |
+| 4 Q8      |           |          |            |           |
++-----------+-----------+----------+------------+-----------+
+
+
+TTS
+---
+
+Resource Occupancy
+~~~~~~~~~~~~~~~~~~
+
+Flash image size: 2.2 MB
+
+RAM runtime: 20 KB
+
+
+Performance Test
+~~~~~~~~~~~~~~~~
+
+CPU loading test (ESP32 @240 MHz):
+
++------------------------------+------+------+------+------+------+------+
+| Speech Rate                  | 0    | 1    | 2    | 3    | 4    | 5    |
++==============================+======+======+======+======+======+======+
+| Times faster than real time  | 4.5  | 3.2  | 2.9  | 2.5  | 2.2  | 1.8  |
++------------------------------+------+------+------+------+------+------+
\ No newline at end of file
diff --git a/docs/en/flash_model/README.rst b/docs/en/flash_model/README.rst
index 594d51f..b245380 100644
--- a/docs/en/flash_model/README.rst
+++ b/docs/en/flash_model/README.rst
@@ -18,7 +18,7 @@ To use our models in your project, you need to flash these models. Currently, ES
     -  Load directly from SIP Flash File System (SPIFFS)
     -  Load from external SD card
 
-    So that on ESP32S3 you can:
+    So that on ESP32-S3 you can:
 
         -  Greatly reduce the size of the user application APP BIN
         -  Supports the selection of up to two wake words
diff --git a/docs/en/getting_started/readme.rst b/docs/en/getting_started/readme.rst
index 366ae6b..0e11712 100644
--- a/docs/en/getting_started/readme.rst
+++ b/docs/en/getting_started/readme.rst
@@ -13,7 +13,7 @@ ESP-SR includes the following modules:
 * :doc:`Audio Front-end AFE <../audio_front_end/README>`
 * :doc:`Wake Word Engine WakeNet <../wake_word_engine/README>`
 * :doc:`Speech Command Word Recognition MultiNet <../speech_command_recognition/README>`
-* Speech Synthesis (only supports Chinese language)
+* :doc:`Speech Synthesis (only supports Chinese language) <../speech_synthesis/readme>`
 
 What You Need
 -------------
diff --git a/docs/en/index.rst b/docs/en/index.rst
index 00d0480..2e0b19f 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -17,6 +17,7 @@ ESP-SR User Guide
     Audio Front-end (AFE) <audio_front_end/index>
     Wake Word WakeNet <wake_word_engine/index>
     Speech Command Word MultiNet <speech_command_recognition/README>
+    Speech Synthesis (Only Supports Chinese Language) <speech_synthesis/readme>
     Flashing Models <flash_model/README>
     Resource Overhead <benchmark/README>
     Test Report <test_report/README>
diff --git a/docs/en/speech_command_recognition/README.rst b/docs/en/speech_command_recognition/README.rst
index 935cb53..44c59f5 100644
--- a/docs/en/speech_command_recognition/README.rst
+++ b/docs/en/speech_command_recognition/README.rst
@@ -228,6 +228,12 @@ Therefore:
 * Single recognition mode: exit the speech recognition when the return status is ``ESP_MN_STATE_DETECTED``
 * Continuous recognition: exit the speech recognition when the return status is ``ESP_MN_STATE_TIMEOUT``
 
+Resource Occupancy
+------------------
+
+For the resource occupancy of this model, see :doc:`Resource Occupancy <../benchmark/README>`.
+
+
 Other configurations
 -----------------------
 
diff --git a/docs/en/speech_synthesis/readme.rst b/docs/en/speech_synthesis/readme.rst
new file mode 100644
index 0000000..12a3b7a
--- /dev/null
+++ b/docs/en/speech_synthesis/readme.rst
@@ -0,0 +1,74 @@
+TTS Speech Synthesis Model
+==========================
+
+:link_to_translation:`zh_CN:[中文]`
+
+Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems, with the following main features:
+
+- Currently **only supports the Chinese language**
+- Input text is encoded in UTF-8
+- Streaming output, which reduces latency
+- Polyphonic pronunciation
+- Adjustable output speech rate
+- Digital broadcasting optimization
+- Customized sound set (coming soon)
+
+Overview
+--------
+
+Using a concatenative method, the current version of TTS includes the following components:
+
+- Parser: converts Chinese text (encoded in UTF-8) to phonemes.
+- Synthesizer: generates wave raw data from the phonemes provided by the parser and the sound set. Default output format: mono, 16 bit @ 16000 Hz.
+
+Workflow:
+
+.. figure:: ../../_static/esp_chinese_tts.png
+   :alt: chinese TTS
+
+Examples
+--------
+
+- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1): 欢迎使用乐鑫语音合成，支付宝收款 72.1 元，微信收款 643.12 元，扫码收款 5489.54 元
+- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2): 支付宝收款 1111.11 元
+
+Programming Procedures
+----------------------
+
+.. code:: c
+
+   #include "esp_tts.h"
+   #include "esp_tts_voice_female.h"
+   #include "esp_partition.h"
+
+   /*** 1. create esp tts handle  ***/
+
+   // initialize the voice set from the separate voice data partition
+
+   const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
+   if (part==0) printf("Couldn't find voice data partition!\n");
+   spi_flash_mmap_handle_t mmap;
+   uint16_t* voicedata;
+   esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
+   esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
+   esp_tts_handle_t *tts_handle=esp_tts_create(voice);
+
+   // 2. parse text and synthesize wave data
+   char *text="欢迎使用乐鑫语音合成";
+   if (esp_tts_parse_chinese(tts_handle, text)) {  // parse text into pinyin list
+       int len[1]={0};
+       do {
+           short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
+           i2s_audio_play(data, len[0]*2, portMAX_DELAY);  // i2s output
+       } while(len[0]>0);
+       i2s_zero_dma_buffer(0);
+   }
+
+
+See :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` for API reference and see the `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ example in ESP-Skainet.
+
+
+Resource Occupancy
+------------------
+
+For the resource occupancy of this model, see :doc:`Resource Occupancy <../benchmark/README>`.
\ No newline at end of file
diff --git a/docs/en/wake_word_engine/README.rst b/docs/en/wake_word_engine/README.rst
index 1d750a1..9979f2b 100644
--- a/docs/en/wake_word_engine/README.rst
+++ b/docs/en/wake_word_engine/README.rst
@@ -98,7 +98,7 @@ Use WakeNet
         afe_handle->disable_wakenet(afe_data)
         afe_handle->enable_wakenet(afe_data)
 
-Resource Consumption
---------------------
+Resource Occupancy
+------------------
 
-Please refer to :doc:`Resource Consumption <../benchmark/README>` .
\ No newline at end of file
+For the resource occupancy of this model, see :doc:`Resource Occupancy <../benchmark/README>`.
\ No newline at end of file
diff --git a/docs/zh_CN/_templates/layout.html b/docs/zh_CN/_templates/layout.html
index de50eda..f778c98 100644
--- a/docs/zh_CN/_templates/layout.html
+++ b/docs/zh_CN/_templates/layout.html
@@ -1,4 +1,4 @@
 {% extends '!layout.html' %}
 {% block comments %}
-<p style="text-align:center"><a href="https://www.espressif.com/zh-hans/company/documents/documentation_feedback?docId=4846&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">提供有关此文档的反馈</a></p>
+<p style="text-align:center"><a href="https://www.espressif.com/zh-hans/company/documents/documentation_feedback?docId=6475&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">提供有关此文档的反馈</a></p>
 {% endblock %}
diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst
index 847fe2d..b64c357 100644
--- a/docs/zh_CN/audio_front_end/README.rst
+++ b/docs/zh_CN/audio_front_end/README.rst
@@ -27,7 +27,7 @@ AEF 声学前端算法框架
     * - AGC (Automatic Gain Control)
       - 自动增益控制算法，可以动态调整输出音频的幅值，当弱信号输入时，放大输出幅度；当输入信号达到一定强度时，压缩输出幅度。
     * - WakeNet
-      - 基于神经网络的唤醒词模型，专为低功耗潜入式 MCU 设计
+      - 基于神经网络的唤醒词模型，专为低功耗嵌入式 MCU 设计
 
 使用场景
 --------
@@ -319,7 +319,7 @@ AEC 的使用和 WakeNet 相似，用户可以根据自己的需求来停止或
     feed 音频数据
     ^^^^^^^^^^^^^
 
-    在初始化 AFE 完成后，使用 :cpp:func: `feed` 函数，将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。
+    在初始化 AFE 完成后，使用 :cpp:func:`feed` 函数，将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。
 
     ::
 
@@ -405,3 +405,9 @@ AEC 的使用和 WakeNet 相似，用户可以根据自己的需求来停止或
             int ret_value;                          // the return state of fetch function
             void* reserved;                         // reserved for future use
         } afe_fetch_result_t;
+
+
+资源消耗
+--------
+
+有关本模型的资源消耗情况，请见 :doc:`资源消耗 <../benchmark/README>`。
\ No newline at end of file
diff --git a/docs/zh_CN/benchmark/README.rst b/docs/zh_CN/benchmark/README.rst
index 39fc110..a212f53 100644
--- a/docs/zh_CN/benchmark/README.rst
+++ b/docs/zh_CN/benchmark/README.rst
@@ -12,7 +12,7 @@ AFE
 .. only:: esp32
 
     +-----------------+-----------------+-----------------+-----------------+
-    | algorithm Type  | RAM             | Average cpu     | Frame Length    |
+    | Algorithm Type  | RAM             | Average CPU     | Frame Length    |
     |                 |                 | loading(compute |                 |
     |                 |                 | with 2 cores)   |                 |
     +=================+=================+=================+=================+
@@ -26,7 +26,7 @@ AFE
 .. only:: esp32s3
 
     +-----------------+-----------------+-----------------+-----------------+
-    | algorithm Type  | RAM             | Average cpu     | Frame Length    |
+    | Algorithm Type  | RAM             | Average CPU     | Frame Length    |
     |                 |                 | loading(compute |                 |
     |                 |                 | with 2 cores)   |                 |
     +=================+=================+=================+=================+
@@ -155,16 +155,38 @@ MultiNet
 性能测试
 ~~~~~~~~
 
-+-----------+-----------+-----------+-----------+-----------+
-| Model     | Distance  | Quiet     | S         | Speech    |
-| Type      |           |           | tationary | Noise     |
-|           |           |           | Noise     | (SNR = 4  |
-|           |           |           | (SNR = 4  | dB)       |
-|           |           |           | dB)       |           |
-+===========+===========+===========+===========+===========+
-| MultiNet  | 3 m       | 98%       | 93%       | 92%       |
-| 4         |           |           |           |           |
-+-----------+-----------+-----------+-----------+-----------+
-| MultiNet  | 3 m       | 94%       | 92%       | 91%       |
-| 4 Q8      |           |           |           |           |
-+-----------+-----------+-----------+-----------+-----------+
\ No newline at end of file
++-----------+-----------+----------+------------+-----------+
+| Model     | Distance  | Quiet    | Stationary | Speech    |
+| Type      |           |          | Noise      | Noise     |
+|           |           |          | (SNR = 4   | (SNR = 4  |
+|           |           |          | dB)        | dB)       |
++===========+===========+==========+============+===========+
+| MultiNet  | 3 m       | 98%      | 93%        | 92%       |
+| 4         |           |          |            |           |
++-----------+-----------+----------+------------+-----------+
+| MultiNet  | 3 m       | 94%      | 92%        | 91%       |
+| 4 Q8      |           |          |            |           |
++-----------+-----------+----------+------------+-----------+
+
+
+TTS
+---
+
+资源占用
+~~~~~~~~
+
+Flash image size: 2.2 MB
+
+RAM runtime: 20 KB
+
+
+性能测试
+~~~~~~~~
+
+CPU 负载测试（ESP32 @240 MHz）：
+
++------------------------------+------+------+------+------+------+------+
+| Speech Rate                  | 0    | 1    | 2    | 3    | 4    | 5    |
++==============================+======+======+======+======+======+======+
+| Times faster than real time  | 4.5  | 3.2  | 2.9  | 2.5  | 2.2  | 1.8  |
++------------------------------+------+------+------+------+------+------+
\ No newline at end of file
diff --git a/docs/zh_CN/getting_started/readme.rst b/docs/zh_CN/getting_started/readme.rst
index b3517af..ec9f2ab 100644
--- a/docs/zh_CN/getting_started/readme.rst
+++ b/docs/zh_CN/getting_started/readme.rst
@@ -12,8 +12,8 @@ ESP-SR 支持以下模块：
 
 * :doc:`声学前端算法 AFE <../audio_front_end/README>`
 * :doc:`唤醒词检测 WakeNet <../wake_word_engine/README>`
-* :doc:`命令词识别 MultiNet<../speech_command_recognition/README>`
-* 语音合成（目前只支持中文）
+* :doc:`命令词识别 MultiNet <../speech_command_recognition/README>`
+* :doc:`语音合成（目前只支持中文）<../speech_synthesis/readme>`
 
 准备工作
 --------
diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst
index c42d032..f3584c3 100644
--- a/docs/zh_CN/index.rst
+++ b/docs/zh_CN/index.rst
@@ -18,6 +18,7 @@ ESP-SR 用户指南
     AFE 声学前端算法 <audio_front_end/index>
     语音唤醒 WakeNet <wake_word_engine/index>
     语音指令 MultiNet <speech_command_recognition/README>
+    语音合成（仅支持中文）<speech_synthesis/readme>
     模型加载 <flash_model/README>
     资源消耗 <benchmark/README>
     测试报告 <test_report/README>
diff --git a/docs/zh_CN/speech_command_recognition/README.rst b/docs/zh_CN/speech_command_recognition/README.rst
index 4991cc8..f1cbc7b 100644
--- a/docs/zh_CN/speech_command_recognition/README.rst
+++ b/docs/zh_CN/speech_command_recognition/README.rst
@@ -228,6 +228,12 @@ MultiNet 命令词识别支持两种基本模式：
 当命令词识别返回状态为 ``ESP_MN_STATE_DETECTED`` 时退出命令词识别，则为单次识别模式；
 当命令词识别返回状态为 ``ESP_MN_STATE_TIMEOUT`` 时退出命令词识别，则为连续识别模式；
 
+
+资源消耗
+--------
+
+有关本模型的资源消耗情况，请见 :doc:`资源消耗 <../benchmark/README>`。
+
 其他配置和使用
 --------------
 
diff --git a/docs/zh_CN/speech_synthesis/readme.rst b/docs/zh_CN/speech_synthesis/readme.rst
new file mode 100644
index 0000000..73ed856
--- /dev/null
+++ b/docs/zh_CN/speech_synthesis/readme.rst
@@ -0,0 +1,73 @@
+TTS 语音合成模型
+================
+
+:link_to_translation:`en:[English]`
+
+乐鑫 TTS 语音合成模型是一个为嵌入式系统设计的轻量化语音合成系统，具有如下主要特性：
+
+- 目前 **仅支持中文**
+- 输入文本采用 UTF-8 编码
+- 输出格式采用流输出，可减少延时
+- 多音词发音自动识别
+- 可调节合成语速
+- 数字播报优化
+- 自定义声音集（敬请期待）
+
+简介
+----
+
+乐鑫 TTS 的当前版本基于拼接法，主要组成部分包括：
+
+- 解析器 (Parser)：根据字典与语法规则，将输入文本（采用 UTF-8 编码）转换为拼音列表。
+- 合成器 (Synthesizer)：根据解析器输出的拼音列表，结合预定义的声音集，合成波形文件。默认输出格式为：单声道，16 bit @ 16000 Hz。
+
+系统框图如下：
+
+.. figure:: ../../_static/esp_chinese_tts.png
+   :alt: chinese TTS
+
+简单示例
+--------
+
+- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1)：欢迎使用乐鑫语音合成，支付宝收款 72.1 元，微信收款 643.12 元，扫码收款 5489.54 元
+- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2)：支付宝收款 1111.11 元
+
+编程指南
+--------
+
+.. code:: c
+
+   #include "esp_tts.h"
+   #include "esp_tts_voice_female.h"
+   #include "esp_partition.h"
+
+   /*** 1. create esp tts handle  ***/
+
+   // initialize the voice set from the separate voice data partition
+
+   const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
+   if (part==0) printf("Couldn't find voice data partition!\n");
+   spi_flash_mmap_handle_t mmap;
+   uint16_t* voicedata;
+   esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
+   esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
+   esp_tts_handle_t *tts_handle=esp_tts_create(voice);
+
+   // 2. parse text and synthesize wave data
+   char *text="欢迎使用乐鑫语音合成";
+   if (esp_tts_parse_chinese(tts_handle, text)) {  // parse text into pinyin list
+       int len[1]={0};
+       do {
+           short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
+           i2s_audio_play(data, len[0]*2, portMAX_DELAY);  // i2s output
+       } while(len[0]>0);
+       i2s_zero_dma_buffer(0);
+   }
+
+更多参考，请前往 :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` 查看 API 定义，或参考 ESP-Skainet 中 `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ 示例。
+
+
+资源消耗
+--------
+
+有关本模型的资源消耗情况，请见 :doc:`资源消耗 <../benchmark/README>`。
\ No newline at end of file
diff --git a/docs/zh_CN/wake_word_engine/README.rst b/docs/zh_CN/wake_word_engine/README.rst
index a602bb1..151e38b 100644
--- a/docs/zh_CN/wake_word_engine/README.rst
+++ b/docs/zh_CN/wake_word_engine/README.rst
@@ -101,4 +101,4 @@ WakeNet 的使用
 资源消耗
 --------
 
-具体请参考 :doc:`资源消耗 <../benchmark/README>` 。
\ No newline at end of file
+有关本模型的资源消耗情况，请见 :doc:`资源消耗 <../benchmark/README>`。
\ No newline at end of file
diff --git a/esp-tts/README.md b/esp-tts/README.md
index 0d1547b..d612f28 100644
--- a/esp-tts/README.md
+++ b/esp-tts/README.md
@@ -1,85 +1,3 @@
-# ESP Chinese TTS [[English]](./README_en.md) 
+# ESP Chinese TTS
 
-乐鑫中文语音合成是一个为嵌入式系统设计的轻量化语音合成系统。
-
-## Overview
-
-乐鑫语音合成当前版本基于拼接法，系统框图如下：
-
-![chinese TTS](./img/esp_chinese_tts.png)
-
-- Parser: 根据字典与语法规则，将输入文本转换为拼音列表, 输入文本编码为UTF-8。
-- Synthesizer: 根据Parser输出的拼音列表，结合预定义的声音集，合成波形文件。默认输出格式为单声道， 16bit@16000Hz。
-
-#### Features：
-
-- [x] UTF-8编码输入
-
-- [x] 流式输出，减少延时
-
-- [x] 多音词发音自动识别
-
-- [x] 可调节合成语速
-
-- [x] 数字播报优化
-
-- [ ] 自定义声音集
-
-
-
-## Performance Test
-
-#### Resource Occupancy
-
-Flash image size： 2.2 MB
-
-RAM runtime: 20 KB
-
-CPU loading test（基于ESP32 @ 240MHz测试 ）:
-
-| speech rate                 |  0   |  1   |  2   |  3   |  4   |  5   |
-| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: |
-| times faster than real time | 4.5  | 3.2  |  2.9 | 2.5  | 2.2  | 1.8  |
-
-#### Samples
-
-
-- 欢迎使用乐鑫语音合成, 支付宝收款72.10元，微信收款643.12元，扫码收款5489.54元, &nbsp; &nbsp; [voice=xiaoxin,speed=1](./samples/xiaoxin_speed1.wav)
-- 支付宝收款 1111.11 元, &nbsp; &nbsp;  [voice=xiaole,speed=2](./samples/S2_xiaole_speed2.wav) 
-
-
-   
-
-## User Guide
-
-```c
-#include "esp_tts.h"
-#include "esp_tts_voice_female.h"
-#include "esp_partition.h"
-
-/*** 1. create esp tts handle  ***/
-
-
-// initial voice set from separate voice data partition
-
-const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
-if (part==0) printf("Couldn't find voice data partition!\n");
-spi_flash_mmap_handle_t mmap;
-uint16_t* voicedata;
-esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
-esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); 
-
-// 2. parse text and synthesis wave data
-char *text="欢迎使用乐鑫语音合成";	
-if (esp_tts_parse_chinese(tts_handle, text)) {  // parse text into pinyin list
-	int len[1]={0};
-	do {
-		short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
-		i2s_audio_play(data, len[0]*2, portMAX_DELAY);  // i2s output             
-	} while(len[0]>0);
-	i2s_zero_dma_buffer(0);
-}
-
-```
-
-更多请参考[esp_tts.h](./esp_tts_chinese/include/esp_tts.h)查看API定义, 或参考esp-skainet中[chinese_tts](https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts)示例.
+The Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems. Currently, only the Chinese language is supported. For more information, see the [ESP-SR speech synthesis documentation](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html).
\ No newline at end of file
diff --git a/esp-tts/README_en.md b/esp-tts/README_en.md
deleted file mode 100644
index 2e11f55..0000000
--- a/esp-tts/README_en.md
+++ /dev/null
@@ -1,92 +0,0 @@
-## ESP Chinese TTS [[中文]](./README.md)
-
-Espressif Chinese TTS  is a lightweight TTS system designed for embedded systems。
-
-## Overview
-
-The Chinese TTS is based on concatenative  method. The flow diagram of system is as follows:
-
-![chinese TTS](./img/esp_chinese_tts.png)
-
-- **Parser** : a Chinese grapheme to phoneme module,  input text (UTF-8) and output Chinese pinyin list. 
-- **Synthesizer** : a concatenative synthesizer, input pinyin list and output wave raw data. The default encoding of raw data is mono, 16 bit@16000 Hz.
-
-####  Features
-
-- [x] UTF-8 encoding text input
-
-- [x] Streaming output
-
-- [x] Polyphonic pronunciation
-
-- [x] Adjustable speech rate
-
-- [x] Digital broadcasting optimization
-
-- [ ] Custom sound set
-
-
-
-## Performance Test
-
-#### Resource Occupancy
-
-Flash image size： 2.2 MB
-
-RAM runtime: 20 KB
-
-CPU loading test（ESP32 @ 240 MHz）:
-
-| speech rate                 |  0   |  1   |  2   |  3   |  4   |  5   |
-| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: |
-| times faster than real time | 4.5  | 3.2  |  2.9 | 2.5  | 2.2  | 1.8  |
-
-**Note:** the bigger rate, the faster speech speed. 0: slowest speaking speed, 5: fastest speaking speed.
-
-#### Samples
-
-- 欢迎使用乐鑫语音合成, &nbsp; &nbsp; [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav), &nbsp; &nbsp;  [voice=小乐,speed=2](./samples/S1_xiaole_speed2.wav) 
-
-- 支付宝收款 1111.11 元, &nbsp; &nbsp;  [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav), &nbsp; &nbsp;  [voice=小乐,speed=2](./samples/S2_xiaole_speed2.wav) 
-
-- 空调制热模式已打开，并调节到25度, &nbsp; &nbsp;  [voice=小乐,speed=0](./samples/S3_xiaole_speed0.wav), &nbsp; &nbsp;   [voice=小乐,speed=4](./samples/S3_xiaole_speed4.wav) 
-
-## User Guide
-
-```c
-#include "esp_tts.h"
-#include "esp_tts_voice_female.h"
-#include "esp_partition.h"
-
-/*** 1. create esp tts handle  ***/
-
-//// Method1: use pre-define xiaole voice lib.
-//// This method is not recommended because the method may make app bin exceed the limit of esp32
-// esp_tts_handle_t *tts_handle=esp_tts_create(esp_tts_voice_female);
-
-  
-// method2: initial voice set from separate voice data partition
-
-const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
-if (part==0) printf("Couldn't find voice data partition!\n");
-spi_flash_mmap_handle_t mmap;
-uint16_t* voicedata;
-esp_err_t err=esp_partition_mmap(part, 0, 3*1024*1024, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
-esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); 
-
-// 2. parse text and synthesis wave data
-char *text="欢迎使用乐鑫语音合成";	
-if (esp_tts_parse_chinese(tts_handle, text)) {  // parse text into pinyin list
-	int len[1]={0};
-	do {
-		short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
-		i2s_audio_play(data, len[0]*2, portMAX_DELAY);  // i2s output             
-	} while(len[0]>0);
-	i2s_zero_dma_buffer(0);
-}
-
-```
-
-please refer to [esp_tts.h](./esp_tts_chinese/include/esp_tts.h) for the details of API or [chinese_tts](../../examples/chinese_tts) example in esp-skainet.
-
-