From a9efcbb4d59919bf0ae7dc570a6da0acf9a95962 Mon Sep 17 00:00:00 2001 From: "liying@espressif.com" Date: Tue, 17 Jan 2023 11:11:07 +0800 Subject: [PATCH] fix doc menu add tts into doc folder add documentation feedback id --- .../img => docs/_static}/esp_chinese_tts.png | Bin docs/conf_common.py | 2 +- docs/en/_templates/layout.html | 2 +- docs/en/audio_front_end/README.rst | 8 +- docs/en/benchmark/README.rst | 52 +++++++--- docs/en/flash_model/README.rst | 2 +- docs/en/getting_started/readme.rst | 2 +- docs/en/index.rst | 1 + docs/en/speech_command_recognition/README.rst | 6 ++ docs/en/speech_synthesis/readme.rst | 74 ++++++++++++++ docs/en/wake_word_engine/README.rst | 6 +- docs/zh_CN/_templates/layout.html | 2 +- docs/zh_CN/audio_front_end/README.rst | 10 +- docs/zh_CN/benchmark/README.rst | 52 +++++++--- docs/zh_CN/getting_started/readme.rst | 4 +- docs/zh_CN/index.rst | 1 + .../speech_command_recognition/README.rst | 6 ++ docs/zh_CN/speech_synthesis/readme.rst | 73 ++++++++++++++ docs/zh_CN/wake_word_engine/README.rst | 2 +- esp-tts/README.md | 86 +--------------- esp-tts/README_en.md | 92 ------------------ 21 files changed, 263 insertions(+), 220 deletions(-) rename {esp-tts/img => docs/_static}/esp_chinese_tts.png (100%) create mode 100644 docs/en/speech_synthesis/readme.rst create mode 100644 docs/zh_CN/speech_synthesis/readme.rst delete mode 100644 esp-tts/README_en.md diff --git a/esp-tts/img/esp_chinese_tts.png b/docs/_static/esp_chinese_tts.png similarity index 100% rename from esp-tts/img/esp_chinese_tts.png rename to docs/_static/esp_chinese_tts.png diff --git a/docs/conf_common.py b/docs/conf_common.py index b23d2e2..dd467b4 100755 --- a/docs/conf_common.py +++ b/docs/conf_common.py @@ -21,7 +21,7 @@ project_slug = 'esp-sr' # Contains info used for constructing target and version selector # Can also be hosted externally, see esp-idf for example -versions_url = '_static/docs_version.js' +versions_url = './_static/docs_version.js' # Final PDF filename 
will contains target and version pdf_file_prefix = u'esp-sr' diff --git a/docs/en/_templates/layout.html b/docs/en/_templates/layout.html index df6a184..5e7afe9 100644 --- a/docs/en/_templates/layout.html +++ b/docs/en/_templates/layout.html @@ -1,4 +1,4 @@ {% extends '!layout.html' %} {% block comments %} -

Provide feedback about this document

+

Provide feedback about this document

{% endblock %} diff --git a/docs/en/audio_front_end/README.rst b/docs/en/audio_front_end/README.rst index 05a931b..48969ec 100644 --- a/docs/en/audio_front_end/README.rst +++ b/docs/en/audio_front_end/README.rst @@ -404,4 +404,10 @@ The usage of AEC is similar to that of WakeNet. Users can disable or enable AEC int wake_word_length; // the length of wake word. It's unit is the number of samples. int ret_value; // the return state of fetch function void* reserved; // reserved for future use - } afe_fetch_result_t; \ No newline at end of file + } afe_fetch_result_t; + + +Resource Occupancy +------------------ + +For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`. \ No newline at end of file diff --git a/docs/en/benchmark/README.rst b/docs/en/benchmark/README.rst index 5637e26..a335d64 100644 --- a/docs/en/benchmark/README.rst +++ b/docs/en/benchmark/README.rst @@ -12,7 +12,7 @@ Resource Occupancy .. only:: esp32 +-----------------+-----------------+-----------------+-----------------+ - | algorithm Type | RAM | Average cpu | Frame Length | + | Algorithm Type | RAM | Average cpu | Frame Length | | | | loading(compute | | | | | with 2 cores) | | +=================+=================+=================+=================+ @@ -26,7 +26,7 @@ Resource Occupancy .. 
only:: esp32s3 +-----------------+-----------------+-----------------+-----------------+ - | algorithm Type | RAM | Average cpu | Frame Length | + | Algorithm Type | RAM | Average cpu | Frame Length | | | | loading(compute | | | | | with 2 cores) | | +=================+=================+=================+=================+ @@ -155,16 +155,38 @@ Resource Occupancy Performance Test ~~~~~~~~~~~~~~~~ -+-----------+-----------+-----------+-----------+-----------+ -| Model | Distance | Quiet | S | Speech | -| Type | | | tationary | Noise | -| | | | Noise | (SNR = 4 | -| | | | (SNR = 4 | dB) | -| | | | dB) | | -+===========+===========+===========+===========+===========+ -| MultiNet | 3 m | 98% | 93% | 92% | -| 4 | | | | | -+-----------+-----------+-----------+-----------+-----------+ -| MultiNet | 3 m | 94% | 92% | 91% | -| 4 Q8 | | | | | -+-----------+-----------+-----------+-----------+-----------+ \ No newline at end of file ++-----------+-----------+----------+------------+-----------+ +| Model | Distance | Quiet | Stationary | Speech | +| Type | | | Noise | Noise | +| | | | (SNR = 4 | (SNR = 4 | +| | | | dB) | dB) | ++===========+===========+==========+============+===========+ +| MultiNet | 3 m | 98% | 93% | 92% | +| 4 | | | | | ++-----------+-----------+----------+------------+-----------+ +| MultiNet | 3 m | 94% | 92% | 91% | +| 4 Q8 | | | | | ++-----------+-----------+----------+------------+-----------+ + + +TTS +--- + +Resource Occupancy +~~~~~~~~~~~~~~~~~~ + +Flash image size: 2.2 MB + +RAM runtime: 20 KB + + +Performance Test +~~~~~~~~~~~~~~~~ + +CPU loading test (ESP32 @240 MHz): + ++------------------------------+------+------+------+------+------+------+ +| Speech Rate | 0 | 1 | 2 | 3 | 4 | 5 | ++==============================+======+======+======+======+======+======+ +| Times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 | ++------------------------------+------+------+------+------+------+------+ \ No newline at end of file diff --git 
a/docs/en/flash_model/README.rst b/docs/en/flash_model/README.rst index 594d51f..b245380 100644 --- a/docs/en/flash_model/README.rst +++ b/docs/en/flash_model/README.rst @@ -18,7 +18,7 @@ To use our models in your project, you need to flash these models. Currently, ES - Load directly from SIP Flash File System (SPIFFS) - Load from external SD card - So that on ESP32S3 you can: + So that on ESP32-S3 you can: - Greatly reduce the size of the user application APP BIN - Supports the selection of up to two wake words diff --git a/docs/en/getting_started/readme.rst b/docs/en/getting_started/readme.rst index 366ae6b..0e11712 100644 --- a/docs/en/getting_started/readme.rst +++ b/docs/en/getting_started/readme.rst @@ -13,7 +13,7 @@ ESP-SR includes the following modules: * :doc:`Audio Front-end AFE <../audio_front_end/README>` * :doc:`Wake Word Engine WakeNet <../wake_word_engine/README>` * :doc:`Speech Command Word Recognition MultiNet <../speech_command_recognition/README>` -* Speech Synthesis (only supports Chinese language) +* :doc:`Speech Synthesis (only supports Chinese language) <../speech_synthesis/readme>` What You Need ------------- diff --git a/docs/en/index.rst b/docs/en/index.rst index 00d0480..2e0b19f 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -17,6 +17,7 @@ ESP-SR User Guide Audio Front-end (AFE) Wake Word WakeNet Speech Command Word MultiNet + Speech Synthesis (Only Supports Chinese Language) Flashing Models Resource Overhead Test Report diff --git a/docs/en/speech_command_recognition/README.rst b/docs/en/speech_command_recognition/README.rst index 935cb53..44c59f5 100644 --- a/docs/en/speech_command_recognition/README.rst +++ b/docs/en/speech_command_recognition/README.rst @@ -228,6 +228,12 @@ Therefore: * Single recognition mode: exit the speech recognition when the return status is ``ESP_MN_STATE_DETECTED`` * Continuous recognition: exit the speech recognition when the return status is ``ESP_MN_STATE_TIMEOUT`` +Resource Occupancy 
+------------------ + +For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`. + + Other configurations ----------------------- diff --git a/docs/en/speech_synthesis/readme.rst b/docs/en/speech_synthesis/readme.rst new file mode 100644 index 0000000..12a3b7a --- /dev/null +++ b/docs/en/speech_synthesis/readme.rst @@ -0,0 +1,74 @@ +TTS Speech Synthesis Model +========================== + +:link_to_translation:`zh_CN:[中文]` + +Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems, with the following main features: + +- Currently **Only supports Chinese language** +- Input text is encoded in UTF-8 +- Streaming output, which reduces latency +- Polyphonic pronunciation +- Adjustable output speech rate +- Digital broadcasting optimization +- Customized sound set (coming soon) + +Overview +-------- + +Using a concatenative method, the current version of TTS includes the following components: + +- Parser: converts Chinese text (encoded in UTF-8) to phonemes. +- Synthesizer: generates wave raw data from the phonemes provided by the parser and the sound set. Default output format: mono, 16 bit @ 16000 Hz. + +Workflow: + +.. figure:: ../../_static/esp_chinese_tts.png + :alt: chinese TTS + +Examples +-------- + +- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1): 欢迎使用乐鑫语音合成,支付宝收款 72.1 元,微信收款 643.12 元,扫码收款 5489.54 元 +- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2): 支付宝收款 1111.11 元 + +Programming Procedures +---------------------- + +.. code:: c + + #include "esp_tts.h" + #include "esp_tts_voice_female.h" + #include "esp_partition.h" + + /*** 1. 
create esp tts handle ***/ + + + // initial voice set from separate voice data partition + + const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data"); + if (part==0) printf("Couldn't find voice data partition!\n"); + spi_flash_mmap_handle_t mmap; + uint16_t* voicedata; + esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap); + esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); + + // 2. parse text and synthesis wave data + char *text="欢迎使用乐鑫语音合成"; + if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list + int len[1]={0}; + do { + short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis + i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output + } while(len[0]>0); + i2s_zero_dma_buffer(0); + } + + +See :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` for API reference and see the `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ example in ESP-Skainet. + + +Resource Occupancy +------------------ + +For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`. 
\ No newline at end of file diff --git a/docs/zh_CN/_templates/layout.html b/docs/zh_CN/_templates/layout.html index de50eda..f778c98 100644 --- a/docs/zh_CN/_templates/layout.html +++ b/docs/zh_CN/_templates/layout.html @@ -1,4 +1,4 @@ {% extends '!layout.html' %} {% block comments %} -

提供有关此文档的反馈

+

提供有关此文档的反馈

{% endblock %} diff --git a/docs/zh_CN/audio_front_end/README.rst b/docs/zh_CN/audio_front_end/README.rst index 847fe2d..b64c357 100644 --- a/docs/zh_CN/audio_front_end/README.rst +++ b/docs/zh_CN/audio_front_end/README.rst @@ -27,7 +27,7 @@ AEF 声学前端算法框架 * - AGC (Automatic Gain Control) - 自动增益控制算法,可以动态调整输出音频的幅值,当弱信号输入时,放大输出幅度;当输入信号达到一定强度时,压缩输出幅度。 * - WakeNet - - 基于神经网络的唤醒词模型,专为低功耗潜入式 MCU 设计 + - 基于神经网络的唤醒词模型,专为低功耗嵌入式 MCU 设计 使用场景 -------- @@ -319,7 +319,7 @@ AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或 feed 音频数据 ^^^^^^^^^^^^^ - 在初始化 AFE 完成后,使用 :cpp:func: `feed` 函数,将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。 + 在初始化 AFE 完成后,使用 :cpp:func:`feed` 函数,将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。 :: @@ -405,3 +405,9 @@ AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或 int ret_value; // the return state of fetch function void* reserved; // reserved for future use } afe_fetch_result_t; + + +资源消耗 +-------- + +有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。 \ No newline at end of file diff --git a/docs/zh_CN/benchmark/README.rst b/docs/zh_CN/benchmark/README.rst index 39fc110..a212f53 100644 --- a/docs/zh_CN/benchmark/README.rst +++ b/docs/zh_CN/benchmark/README.rst @@ -12,7 +12,7 @@ AFE .. only:: esp32 +-----------------+-----------------+-----------------+-----------------+ - | algorithm Type | RAM | Average cpu | Frame Length | + | Algorithm Type | RAM | Average cpu | Frame Length | | | | loading(compute | | | | | with 2 cores) | | +=================+=================+=================+=================+ @@ -26,7 +26,7 @@ AFE .. 
only:: esp32s3 +-----------------+-----------------+-----------------+-----------------+ - | algorithm Type | RAM | Average cpu | Frame Length | + | Algorithm Type | RAM | Average cpu | Frame Length | | | | loading(compute | | | | | with 2 cores) | | +=================+=================+=================+=================+ @@ -155,16 +155,38 @@ MultiNet 性能测试 ~~~~~~~~ -+-----------+-----------+-----------+-----------+-----------+ -| Model | Distance | Quiet | S | Speech | -| Type | | | tationary | Noise | -| | | | Noise | (SNR = 4 | -| | | | (SNR = 4 | dB) | -| | | | dB) | | -+===========+===========+===========+===========+===========+ -| MultiNet | 3 m | 98% | 93% | 92% | -| 4 | | | | | -+-----------+-----------+-----------+-----------+-----------+ -| MultiNet | 3 m | 94% | 92% | 91% | -| 4 Q8 | | | | | -+-----------+-----------+-----------+-----------+-----------+ \ No newline at end of file ++-----------+-----------+----------+------------+-----------+ +| Model | Distance | Quiet | Stationary | Speech | +| Type | | | Noise | Noise | +| | | | (SNR = 4 | (SNR = 4 | +| | | | dB) | dB) | ++===========+===========+==========+============+===========+ +| MultiNet | 3 m | 98% | 93% | 92% | +| 4 | | | | | ++-----------+-----------+----------+------------+-----------+ +| MultiNet | 3 m | 94% | 92% | 91% | +| 4 Q8 | | | | | ++-----------+-----------+----------+------------+-----------+ + + +TTS +--- + +资源占用 +~~~~~~~~ + +Flash image size: 2.2 MB + +RAM runtime: 20 KB + + +性能测试 +~~~~~~~~ + +CPU 负载测试(ESP32 @240 MHz): + ++------------------------------+------+------+------+------+------+------+ +| Speech Rate | 0 | 1 | 2 | 3 | 4 | 5 | ++==============================+======+======+======+======+======+======+ +| Times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 | ++------------------------------+------+------+------+------+------+------+ \ No newline at end of file diff --git a/docs/zh_CN/getting_started/readme.rst b/docs/zh_CN/getting_started/readme.rst index 
b3517af..ec9f2ab 100644 --- a/docs/zh_CN/getting_started/readme.rst +++ b/docs/zh_CN/getting_started/readme.rst @@ -12,8 +12,8 @@ ESP-SR 支持以下模块: * :doc:`声学前端算法 AFE <../audio_front_end/README>` * :doc:`唤醒词检测 WakeNet <../wake_word_engine/README>` -* :doc:`命令词识别 MultiNet<../speech_command_recognition/README>` -* 语音合成(目前只支持中文) +* :doc:`命令词识别 MultiNet <../speech_command_recognition/README>` +* :doc:`语音合成(目前只支持中文)<../speech_synthesis/readme>` 准备工作 -------- diff --git a/docs/zh_CN/index.rst b/docs/zh_CN/index.rst index c42d032..f3584c3 100644 --- a/docs/zh_CN/index.rst +++ b/docs/zh_CN/index.rst @@ -18,6 +18,7 @@ ESP-SR 用户指南 AFE 声学前端算法 语音唤醒 WakeNet 语音指令 MultiNet + 语音合成(仅支持中文) 模型加载 资源消耗 测试报告 diff --git a/docs/zh_CN/speech_command_recognition/README.rst b/docs/zh_CN/speech_command_recognition/README.rst index 4991cc8..f1cbc7b 100644 --- a/docs/zh_CN/speech_command_recognition/README.rst +++ b/docs/zh_CN/speech_command_recognition/README.rst @@ -228,6 +228,12 @@ MultiNet 命令词识别支持两种基本模式: 当命令词识别返回状态为 ``ESP_MN_STATE_DETECTED`` 时退出命令词识别,则为单次识别模式; 当命令词识别返回状态为 ``ESP_MN_STATE_TIMEOUT`` 时退出命令词识别,则为连续识别模式; + +资源消耗 +-------- + +有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。 + 其他配置和使用 -------------- diff --git a/docs/zh_CN/speech_synthesis/readme.rst b/docs/zh_CN/speech_synthesis/readme.rst new file mode 100644 index 0000000..73ed856 --- /dev/null +++ b/docs/zh_CN/speech_synthesis/readme.rst @@ -0,0 +1,73 @@ +TTS 语音合成模型 +================ + +:link_to_translation:`en:[English]` + +乐鑫 TTS 语音合成模型是一个为嵌入式系统设计的轻量化语音合成系统,具有如下主要特性: + +- 目前 **仅支持中文** +- 输入文本采用 UTF-8 编码 +- 输出格式采用流输出,可减少延时 +- 多音词发音自动识别 +- 可调节合成语速 +- 数字播报优化 +- 自定义声音集(敬请期待) + +简介 +---- + +乐鑫 TTS 的当前版本基于拼接法,主要组成部分包括: + +- 解析器 (Parser):根据字典与语法规则,将输入文本(采用 UTF-8 编码)转换为拼音列表。 +- 合成器 (Synthesizer):根据解析器输出的拼音列表,结合预定义的声音集,合成波形文件。默认输出格式为:单声道,16 bit @ 16000Hz。 + +系统框图如下: + +.. 
figure:: ../../_static/esp_chinese_tts.png + :alt: chinese TTS + +简单示例 +-------- + +- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1):欢迎使用乐鑫语音合成,支付宝收款 72.1 元,微信收款 643.12 元,扫码收款 5489.54 元 +- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2): 支付宝收款 1111.11 元 + +编程指南 +-------- + +.. code:: c + + #include "esp_tts.h" + #include "esp_tts_voice_female.h" + #include "esp_partition.h" + + /*** 1. create esp tts handle ***/ + + + // initial voice set from separate voice data partition + + const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data"); + if (part==0) printf("Couldn't find voice data partition!\n"); + spi_flash_mmap_handle_t mmap; + uint16_t* voicedata; + esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap); + esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); + + // 2. parse text and synthesis wave data + char *text="欢迎使用乐鑫语音合成"; + if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list + int len[1]={0}; + do { + short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis + i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output + } while(len[0]>0); + i2s_zero_dma_buffer(0); + } + +更多参考,请前往 :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` 查看 API 定义,或参考 ESP-Skainet 中 `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ 示例。 
+ + +资源消耗 +-------- + +有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。 \ No newline at end of file diff --git a/docs/zh_CN/wake_word_engine/README.rst b/docs/zh_CN/wake_word_engine/README.rst index a602bb1..151e38b 100644 --- a/docs/zh_CN/wake_word_engine/README.rst +++ b/docs/zh_CN/wake_word_engine/README.rst @@ -101,4 +101,4 @@ WakeNet 的使用 资源消耗 -------- -具体请参考 :doc:`资源消耗 <../benchmark/README>` 。 \ No newline at end of file +有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。 \ No newline at end of file diff --git a/esp-tts/README.md b/esp-tts/README.md index 0d1547b..d612f28 100644 --- a/esp-tts/README.md +++ b/esp-tts/README.md @@ -1,85 +1,3 @@ -# ESP Chinese TTS [[English]](./README_en.md) +# ESP Chinese TTS -乐鑫中文语音合成是一个为嵌入式系统设计的轻量化语音合成系统。 - -## Overview - -乐鑫语音合成当前版本基于拼接法,系统框图如下: - -![chinese TTS](./img/esp_chinese_tts.png) - -- Parser: 根据字典与语法规则,将输入文本转换为拼音列表, 输入文本编码为UTF-8。 -- Synthesizer: 根据Parser输出的拼音列表,结合预定义的声音集,合成波形文件。默认输出格式为单声道, 16bit@16000Hz。 - -#### Features: - -- [x] UTF-8编码输入 - -- [x] 流式输出,减少延时 - -- [x] 多音词发音自动识别 - -- [x] 可调节合成语速 - -- [x] 数字播报优化 - -- [ ] 自定义声音集 - - - -## Performance Test - -#### Resource Occupancy - -Flash image size: 2.2 MB - -RAM runtime: 20 KB - -CPU loading test(基于ESP32 @ 240MHz测试 ): - -| speech rate | 0 | 1 | 2 | 3 | 4 | 5 | -| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: | -| times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 | - -#### Samples - - -- 欢迎使用乐鑫语音合成, 支付宝收款72.10元,微信收款643.12元,扫码收款5489.54元,     [voice=xiaoxin,speed=1](./samples/xiaoxin_speed1.wav) -- 支付宝收款 1111.11 元,     [voice=xiaole,speed=2](./samples/S2_xiaole_speed2.wav) - - - - -## User Guide - -```c -#include "esp_tts.h" -#include "esp_tts_voice_female.h" -#include "esp_partition.h" - -/*** 1. 
create esp tts handle ***/ - - -// initial voice set from separate voice data partition - -const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data"); -if (part==0) printf("Couldn't find voice data partition!\n"); -spi_flash_mmap_handle_t mmap; -uint16_t* voicedata; -esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap); -esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); - -// 2. parse text and synthesis wave data -char *text="欢迎使用乐鑫语音合成"; -if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list - int len[1]={0}; - do { - short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis - i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output - } while(len[0]>0); - i2s_zero_dma_buffer(0); -} - -``` - -更多请参考[esp_tts.h](./esp_tts_chinese/include/esp_tts.h)查看API定义, 或参考esp-skainet中[chinese_tts](https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts)示例. +Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems. Currently, only the Chinese language is supported. See more documentation [Here](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html). \ No newline at end of file diff --git a/esp-tts/README_en.md b/esp-tts/README_en.md deleted file mode 100644 index 2e11f55..0000000 --- a/esp-tts/README_en.md +++ /dev/null @@ -1,92 +0,0 @@ -## ESP Chinese TTS [[中文]](./README.md) - -Espressif Chinese TTS is a lightweight TTS system designed for embedded systems。 - -## Overview - -The Chinese TTS is based on concatenative method. The flow diagram of system is as follows: - -![chinese TTS](./img/esp_chinese_tts.png) - -- **Parser** : a Chinese grapheme to phoneme module, input text (UTF-8) and output Chinese pinyin list. 
-- **Synthesizer** : a concatenative synthesizer, input pinyin list and output wave raw data. The default encoding of raw data is mono, 16 bit@16000 Hz. - -#### Features - -- [x] UTF-8 encoding text input - -- [x] Streaming output - -- [x] Polyphonic pronunciation - -- [x] Adjustable speech rate - -- [x] Digital broadcasting optimization - -- [ ] Custom sound set - - - -## Performance Test - -#### Resource Occupancy - -Flash image size: 2.2 MB - -RAM runtime: 20 KB - -CPU loading test(ESP32 @ 240 MHz): - -| speech rate | 0 | 1 | 2 | 3 | 4 | 5 | -| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: | -| times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 | - -**Note:** the bigger rate, the faster speech speed. 0: slowest speaking speed, 5: fastest speaking speed. - -#### Samples - -- 欢迎使用乐鑫语音合成,     [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav),     [voice=小乐,speed=2](./samples/S1_xiaole_speed2.wav) - -- 支付宝收款 1111.11 元,     [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav),     [voice=小乐,speed=2](./samples/S2_xiaole_speed2.wav) - -- 空调制热模式已打开,并调节到25度,     [voice=小乐,speed=0](./samples/S3_xiaole_speed0.wav),     [voice=小乐,speed=4](./samples/S3_xiaole_speed4.wav) - -## User Guide - -```c -#include "esp_tts.h" -#include "esp_tts_voice_female.h" -#include "esp_partition.h" - -/*** 1. create esp tts handle ***/ - -//// Method1: use pre-define xiaole voice lib. 
-//// This method is not recommended because the method may make app bin exceed the limit of esp32 -// esp_tts_handle_t *tts_handle=esp_tts_create(esp_tts_voice_female); - - -// method2: initial voice set from separate voice data partition - -const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data"); -if (part==0) printf("Couldn't find voice data partition!\n"); -spi_flash_mmap_handle_t mmap; -uint16_t* voicedata; -esp_err_t err=esp_partition_mmap(part, 0, 3*1024*1024, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap); -esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata); - -// 2. parse text and synthesis wave data -char *text="欢迎使用乐鑫语音合成"; -if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list - int len[1]={0}; - do { - short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis - i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output - } while(len[0]>0); - i2s_zero_dma_buffer(0); -} - -``` - -please refer to [esp_tts.h](./esp_tts_chinese/include/esp_tts.h) for the details of API or [chinese_tts](../../examples/chinese_tts) example in esp-skainet. - -