mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Merge branch 'docs/fix_doc_menu' into 'master'
Docs/add_tts_doc See merge request speech-recognition-framework/esp-sr!15
This commit is contained in:
commit
64d29ece1e
|
Before Width: | Height: | Size: 22 KiB After Width: | Height: | Size: 22 KiB |
@ -21,7 +21,7 @@ project_slug = 'esp-sr'
|
||||
|
||||
# Contains info used for constructing target and version selector
|
||||
# Can also be hosted externally, see esp-idf for example
|
||||
versions_url = '_static/docs_version.js'
|
||||
versions_url = './_static/docs_version.js'
|
||||
|
||||
# Final PDF filename will contain the target and version
|
||||
pdf_file_prefix = u'esp-sr'
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
{% extends '!layout.html' %}
|
||||
{% block comments %}
|
||||
<p style="text-align:center"><a href="https://www.espressif.com/en/company/documents/documentation_feedback?docId=4419&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">Provide feedback about this document</a></p>
|
||||
<p style="text-align:center"><a href="https://www.espressif.com/en/company/documents/documentation_feedback?docId=6473&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">Provide feedback about this document</a></p>
|
||||
{% endblock %}
|
||||
|
||||
@ -404,4 +404,10 @@ The usage of AEC is similar to that of WakeNet. Users can disable or enable AEC
|
||||
int wake_word_length; // the length of the wake word. Its unit is the number of samples.
|
||||
int ret_value; // the return state of fetch function
|
||||
void* reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
} afe_fetch_result_t;
|
||||
|
||||
|
||||
Resource Occupancy
|
||||
------------------
|
||||
|
||||
For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`.
|
||||
@ -12,7 +12,7 @@ Resource Occupancy
|
||||
.. only:: esp32
|
||||
|
||||
+-----------------+-----------------+-----------------+-----------------+
|
||||
| algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| Algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| | | loading(compute | |
|
||||
| | | with 2 cores) | |
|
||||
+=================+=================+=================+=================+
|
||||
@ -26,7 +26,7 @@ Resource Occupancy
|
||||
.. only:: esp32s3
|
||||
|
||||
+-----------------+-----------------+-----------------+-----------------+
|
||||
| algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| Algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| | | loading(compute | |
|
||||
| | | with 2 cores) | |
|
||||
+=================+=================+=================+=================+
|
||||
@ -155,16 +155,38 @@ Resource Occupancy
|
||||
Performance Test
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
| Model | Distance | Quiet | S | Speech |
|
||||
| Type | | | tationary | Noise |
|
||||
| | | | Noise | (SNR = 4 |
|
||||
| | | | (SNR = 4 | dB) |
|
||||
| | | | dB) | |
|
||||
+===========+===========+===========+===========+===========+
|
||||
| MultiNet | 3 m | 98% | 93% | 92% |
|
||||
| 4 | | | | |
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
| MultiNet | 3 m | 94% | 92% | 91% |
|
||||
| 4 Q8 | | | | |
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
| Model | Distance | Quiet | Stationary | Speech |
|
||||
| Type | | | Noise | Noise |
|
||||
| | | | (SNR = 4 | (SNR = 4 |
|
||||
| | | | dB) | dB) |
|
||||
+===========+===========+==========+============+===========+
|
||||
| MultiNet | 3 m | 98% | 93% | 92% |
|
||||
| 4 | | | | |
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
| MultiNet | 3 m | 94% | 92% | 91% |
|
||||
| 4 Q8 | | | | |
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
|
||||
|
||||
TTS
|
||||
---
|
||||
|
||||
Resource Occupancy
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Flash image size: 2.2 MB
|
||||
|
||||
RAM runtime: 20 KB
|
||||
|
||||
|
||||
Performance Test
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
CPU loading test (ESP32 @240 MHz):
|
||||
|
||||
+------------------------------+------+------+------+------+------+------+
|
||||
| Speech Rate | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
+==============================+======+======+======+======+======+======+
|
||||
| Times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 |
|
||||
+------------------------------+------+------+------+------+------+------+
|
||||
@ -18,7 +18,7 @@ To use our models in your project, you need to flash these models. Currently, ES
|
||||
- Load directly from SPI Flash File System (SPIFFS)
|
||||
- Load from external SD card
|
||||
|
||||
So that on ESP32S3 you can:
|
||||
So that on ESP32-S3 you can:
|
||||
|
||||
- Greatly reduce the size of the user application APP BIN
|
||||
- Support the selection of up to two wake words
|
||||
|
||||
@ -13,7 +13,7 @@ ESP-SR includes the following modules:
|
||||
* :doc:`Audio Front-end AFE <../audio_front_end/README>`
|
||||
* :doc:`Wake Word Engine WakeNet <../wake_word_engine/README>`
|
||||
* :doc:`Speech Command Word Recognition MultiNet <../speech_command_recognition/README>`
|
||||
* Speech Synthesis (only supports Chinese language)
|
||||
* :doc:`Speech Synthesis (only supports Chinese language) <../speech_synthesis/readme>`
|
||||
|
||||
What You Need
|
||||
-------------
|
||||
|
||||
@ -17,6 +17,7 @@ ESP-SR User Guide
|
||||
Audio Front-end (AFE) <audio_front_end/index>
|
||||
Wake Word WakeNet <wake_word_engine/index>
|
||||
Speech Command Word MultiNet <speech_command_recognition/README>
|
||||
Speech Synthesis (Only Supports Chinese Language) <speech_synthesis/readme>
|
||||
Flashing Models <flash_model/README>
|
||||
Resource Overhead <benchmark/README>
|
||||
Test Report <test_report/README>
|
||||
|
||||
@ -228,6 +228,12 @@ Therefore:
|
||||
* Single recognition mode: exit the speech recognition when the return status is ``ESP_MN_STATE_DETECTED``
|
||||
* Continuous recognition: exit the speech recognition when the return status is ``ESP_MN_STATE_TIMEOUT``
|
||||
|
||||
Resource Occupancy
|
||||
------------------
|
||||
|
||||
For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`.
|
||||
|
||||
|
||||
Other configurations
|
||||
-----------------------
|
||||
|
||||
|
||||
74
docs/en/speech_synthesis/readme.rst
Normal file
74
docs/en/speech_synthesis/readme.rst
Normal file
@ -0,0 +1,74 @@
|
||||
TTS Speech Synthesis Model
|
||||
==========================
|
||||
|
||||
:link_to_translation:`zh_CN:[中文]`
|
||||
|
||||
Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems, with the following main features:
|
||||
|
||||
- Currently **only supports the Chinese language**
|
||||
- Input text is encoded in UTF-8
|
||||
- Streaming output, which reduces latency
|
||||
- Polyphonic pronunciation
|
||||
- Adjustable output speech rate
|
||||
- Digital broadcasting optimization
|
||||
- Customized sound set (coming soon)
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
Using a concatenative method, the current version of TTS includes the following components:
|
||||
|
||||
- Parser: converts Chinese text (encoded in UTF-8) to phonemes.
|
||||
- Synthesizer: generates wave raw data from the phonemes provided by the parser and the sound set. Default output format: mono, 16 bit @ 16000 Hz.
|
||||
|
||||
Workflow:
|
||||
|
||||
.. figure:: ../../_static/esp_chinese_tts.png
|
||||
:alt: chinese TTS
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1): 欢迎使用乐鑫语音合成,支付宝收款 72.1 元,微信收款 643.12 元,扫码收款 5489.54 元
|
||||
- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2): 支付宝收款 1111.11 元
|
||||
|
||||
Programming Procedures
|
||||
----------------------
|
||||
|
||||
.. code:: c
|
||||
|
||||
#include "esp_tts.h"
|
||||
#include "esp_tts_voice_female.h"
|
||||
#include "esp_partition.h"
|
||||
|
||||
/*** 1. create esp tts handle ***/
|
||||
|
||||
|
||||
// initial voice set from separate voice data partition
|
||||
|
||||
const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
|
||||
if (part==0) printf("Couldn't find voice data partition!\n");
|
||||
spi_flash_mmap_handle_t mmap;
|
||||
uint16_t* voicedata;
|
||||
esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
|
||||
esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
|
||||
|
||||
// 2. parse text and synthesize wave data
|
||||
char *text="欢迎使用乐鑫语音合成";
|
||||
if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list
|
||||
int len[1]={0};
|
||||
do {
|
||||
short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
|
||||
i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output
|
||||
} while(len[0]>0);
|
||||
i2s_zero_dma_buffer(0);
|
||||
}
|
||||
|
||||
|
||||
See :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` for API reference and see the `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ example in ESP-Skainet.
|
||||
|
||||
|
||||
Resource Occupancy
|
||||
------------------
|
||||
|
||||
For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`.
|
||||
@ -98,7 +98,7 @@ Use WakeNet
|
||||
afe_handle->disable_wakenet(afe_data)
|
||||
afe_handle->enable_wakenet(afe_data)
|
||||
|
||||
Resource Consumption
|
||||
--------------------
|
||||
Resource Occupancy
|
||||
------------------
|
||||
|
||||
Please refer to :doc:`Resource Consumption <../benchmark/README>` .
|
||||
For the resource occupancy for this model, see :doc:`Resource Occupancy <../benchmark/README>`.
|
||||
@ -1,4 +1,4 @@
|
||||
{% extends '!layout.html' %}
|
||||
{% block comments %}
|
||||
<p style="text-align:center"><a href="https://www.espressif.com/zh-hans/company/documents/documentation_feedback?docId=4846&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">提供有关此文档的反馈</a></p>
|
||||
<p style="text-align:center"><a href="https://www.espressif.com/zh-hans/company/documents/documentation_feedback?docId=6475&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">提供有关此文档的反馈</a></p>
|
||||
{% endblock %}
|
||||
|
||||
@ -27,7 +27,7 @@ AEF 声学前端算法框架
|
||||
* - AGC (Automatic Gain Control)
|
||||
- 自动增益控制算法,可以动态调整输出音频的幅值,当弱信号输入时,放大输出幅度;当输入信号达到一定强度时,压缩输出幅度。
|
||||
* - WakeNet
|
||||
- 基于神经网络的唤醒词模型,专为低功耗潜入式 MCU 设计
|
||||
- 基于神经网络的唤醒词模型,专为低功耗嵌入式 MCU 设计
|
||||
|
||||
使用场景
|
||||
--------
|
||||
@ -319,7 +319,7 @@ AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或
|
||||
feed 音频数据
|
||||
^^^^^^^^^^^^^
|
||||
|
||||
在初始化 AFE 完成后,使用 :cpp:func: `feed` 函数,将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。
|
||||
在初始化 AFE 完成后,使用 :cpp:func:`feed` 函数,将音频数据输入到 AFE 模块中进行处理。输入音频的格式详见 :ref:`input-audio-1` 。
|
||||
|
||||
::
|
||||
|
||||
@ -405,3 +405,9 @@ AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或
|
||||
int ret_value; // the return state of fetch function
|
||||
void* reserved; // reserved for future use
|
||||
} afe_fetch_result_t;
|
||||
|
||||
|
||||
资源消耗
|
||||
--------
|
||||
|
||||
有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。
|
||||
@ -12,7 +12,7 @@ AFE
|
||||
.. only:: esp32
|
||||
|
||||
+-----------------+-----------------+-----------------+-----------------+
|
||||
| algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| Algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| | | loading(compute | |
|
||||
| | | with 2 cores) | |
|
||||
+=================+=================+=================+=================+
|
||||
@ -26,7 +26,7 @@ AFE
|
||||
.. only:: esp32s3
|
||||
|
||||
+-----------------+-----------------+-----------------+-----------------+
|
||||
| algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| Algorithm Type | RAM | Average cpu | Frame Length |
|
||||
| | | loading(compute | |
|
||||
| | | with 2 cores) | |
|
||||
+=================+=================+=================+=================+
|
||||
@ -155,16 +155,38 @@ MultiNet
|
||||
性能测试
|
||||
~~~~~~~~
|
||||
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
| Model | Distance | Quiet | S | Speech |
|
||||
| Type | | | tationary | Noise |
|
||||
| | | | Noise | (SNR = 4 |
|
||||
| | | | (SNR = 4 | dB) |
|
||||
| | | | dB) | |
|
||||
+===========+===========+===========+===========+===========+
|
||||
| MultiNet | 3 m | 98% | 93% | 92% |
|
||||
| 4 | | | | |
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
| MultiNet | 3 m | 94% | 92% | 91% |
|
||||
| 4 Q8 | | | | |
|
||||
+-----------+-----------+-----------+-----------+-----------+
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
| Model | Distance | Quiet | Stationary | Speech |
|
||||
| Type | | | Noise | Noise |
|
||||
| | | | (SNR = 4 | (SNR = 4 |
|
||||
| | | | dB) | dB) |
|
||||
+===========+===========+==========+============+===========+
|
||||
| MultiNet | 3 m | 98% | 93% | 92% |
|
||||
| 4 | | | | |
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
| MultiNet | 3 m | 94% | 92% | 91% |
|
||||
| 4 Q8 | | | | |
|
||||
+-----------+-----------+----------+------------+-----------+
|
||||
|
||||
|
||||
TTS
|
||||
---
|
||||
|
||||
资源占用
|
||||
~~~~~~~~
|
||||
|
||||
Flash image size: 2.2 MB
|
||||
|
||||
RAM runtime: 20 KB
|
||||
|
||||
|
||||
性能测试
|
||||
~~~~~~~~
|
||||
|
||||
CPU 负载测试(ESP32 @240 MHz):
|
||||
|
||||
+------------------------------+------+------+------+------+------+------+
|
||||
| Speech Rate | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
+==============================+======+======+======+======+======+======+
|
||||
| Times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 |
|
||||
+------------------------------+------+------+------+------+------+------+
|
||||
@ -12,8 +12,8 @@ ESP-SR 支持以下模块:
|
||||
|
||||
* :doc:`声学前端算法 AFE <../audio_front_end/README>`
|
||||
* :doc:`唤醒词检测 WakeNet <../wake_word_engine/README>`
|
||||
* :doc:`命令词识别 MultiNet<../speech_command_recognition/README>`
|
||||
* 语音合成(目前只支持中文)
|
||||
* :doc:`命令词识别 MultiNet <../speech_command_recognition/README>`
|
||||
* :doc:`语音合成(目前只支持中文)<../speech_synthesis/readme>`
|
||||
|
||||
准备工作
|
||||
--------
|
||||
|
||||
@ -18,6 +18,7 @@ ESP-SR 用户指南
|
||||
AFE 声学前端算法 <audio_front_end/index>
|
||||
语音唤醒 WakeNet <wake_word_engine/index>
|
||||
语音指令 MultiNet <speech_command_recognition/README>
|
||||
语音合成(仅支持中文)<speech_synthesis/readme>
|
||||
模型加载 <flash_model/README>
|
||||
资源消耗 <benchmark/README>
|
||||
测试报告 <test_report/README>
|
||||
|
||||
@ -228,6 +228,12 @@ MultiNet 命令词识别支持两种基本模式:
|
||||
当命令词识别返回状态为 ``ESP_MN_STATE_DETECTED`` 时退出命令词识别,则为单次识别模式;
|
||||
当命令词识别返回状态为 ``ESP_MN_STATE_TIMEOUT`` 时退出命令词识别,则为连续识别模式;
|
||||
|
||||
|
||||
资源消耗
|
||||
--------
|
||||
|
||||
有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。
|
||||
|
||||
其他配置和使用
|
||||
--------------
|
||||
|
||||
|
||||
73
docs/zh_CN/speech_synthesis/readme.rst
Normal file
73
docs/zh_CN/speech_synthesis/readme.rst
Normal file
@ -0,0 +1,73 @@
|
||||
TTS 语音合成模型
|
||||
================
|
||||
|
||||
:link_to_translation:`en:[English]`
|
||||
|
||||
乐鑫 TTS 语音合成模型是一个为嵌入式系统设计的轻量化语音合成系统,具有如下主要特性:
|
||||
|
||||
- 目前 **仅支持中文**
|
||||
- 输入文本采用 UTF-8 编码
|
||||
- 输出格式采用流输出,可减少延时
|
||||
- 多音词发音自动识别
|
||||
- 可调节合成语速
|
||||
- 数字播报优化
|
||||
- 自定义声音集(敬请期待)
|
||||
|
||||
简介
|
||||
----
|
||||
|
||||
乐鑫 TTS 的当前版本基于拼接法,主要组成部分包括:
|
||||
|
||||
- 解析器 (Parser):根据字典与语法规则,将输入文本(采用 UTF-8 编码)转换为拼音列表。
|
||||
- 合成器 (Synthesizer):根据解析器输出的拼音列表,结合预定义的声音集,合成波形文件。默认输出格式为:单声道,16 bit @ 16000Hz。
|
||||
|
||||
系统框图如下:
|
||||
|
||||
.. figure:: ../../_static/esp_chinese_tts.png
|
||||
:alt: chinese TTS
|
||||
|
||||
简单示例
|
||||
--------
|
||||
|
||||
- :project_file:`esp-tts/samples/xiaoxin_speed1.wav` (voice=xiaoxin, speed=1):欢迎使用乐鑫语音合成,支付宝收款 72.1 元,微信收款 643.12 元,扫码收款 5489.54 元
|
||||
- :project_file:`esp-tts/samples/S2_xiaole_speed2.wav` (voice=xiaole, speed=2): 支付宝收款 1111.11 元
|
||||
|
||||
编程指南
|
||||
--------
|
||||
|
||||
.. code:: c
|
||||
|
||||
#include "esp_tts.h"
|
||||
#include "esp_tts_voice_female.h"
|
||||
#include "esp_partition.h"
|
||||
|
||||
/*** 1. create esp tts handle ***/
|
||||
|
||||
|
||||
// initial voice set from separate voice data partition
|
||||
|
||||
const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
|
||||
if (part==0) printf("Couldn't find voice data partition!\n");
|
||||
spi_flash_mmap_handle_t mmap;
|
||||
uint16_t* voicedata;
|
||||
esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
|
||||
esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
|
||||
|
||||
// 2. parse text and synthesize wave data
|
||||
char *text="欢迎使用乐鑫语音合成";
|
||||
if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list
|
||||
int len[1]={0};
|
||||
do {
|
||||
short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
|
||||
i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output
|
||||
} while(len[0]>0);
|
||||
i2s_zero_dma_buffer(0);
|
||||
}
|
||||
|
||||
更多参考,请前往 :project_file:`esp-tts/esp_tts_chinese/include/esp_tts.h` 查看 API 定义,或参考 ESP-Skainet 中 `chinese_tts <https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts>`__ 示例。
|
||||
|
||||
|
||||
资源消耗
|
||||
--------
|
||||
|
||||
有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。
|
||||
@ -101,4 +101,4 @@ WakeNet 的使用
|
||||
资源消耗
|
||||
--------
|
||||
|
||||
具体请参考 :doc:`资源消耗 <../benchmark/README>` 。
|
||||
有关本模型的资源消耗情况,请见 :doc:`资源消耗 <../benchmark/README>`。
|
||||
@ -1,85 +1,3 @@
|
||||
# ESP Chinese TTS [[English]](./README_en.md)
|
||||
# ESP Chinese TTS
|
||||
|
||||
乐鑫中文语音合成是一个为嵌入式系统设计的轻量化语音合成系统。
|
||||
|
||||
## Overview
|
||||
|
||||
乐鑫语音合成当前版本基于拼接法,系统框图如下:
|
||||
|
||||

|
||||
|
||||
- Parser: 根据字典与语法规则,将输入文本转换为拼音列表, 输入文本编码为UTF-8。
|
||||
- Synthesizer: 根据Parser输出的拼音列表,结合预定义的声音集,合成波形文件。默认输出格式为单声道, 16bit@16000Hz。
|
||||
|
||||
#### Features:
|
||||
|
||||
- [x] UTF-8编码输入
|
||||
|
||||
- [x] 流式输出,减少延时
|
||||
|
||||
- [x] 多音词发音自动识别
|
||||
|
||||
- [x] 可调节合成语速
|
||||
|
||||
- [x] 数字播报优化
|
||||
|
||||
- [ ] 自定义声音集
|
||||
|
||||
|
||||
|
||||
## Performance Test
|
||||
|
||||
#### Resource Occupancy
|
||||
|
||||
Flash image size: 2.2 MB
|
||||
|
||||
RAM runtime: 20 KB
|
||||
|
||||
CPU loading test(基于ESP32 @ 240MHz测试 ):
|
||||
|
||||
| speech rate | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: |
|
||||
| times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 |
|
||||
|
||||
#### Samples
|
||||
|
||||
|
||||
- 欢迎使用乐鑫语音合成, 支付宝收款72.10元,微信收款643.12元,扫码收款5489.54元, [voice=xiaoxin,speed=1](./samples/xiaoxin_speed1.wav)
|
||||
- 支付宝收款 1111.11 元, [voice=xiaole,speed=2](./samples/S2_xiaole_speed2.wav)
|
||||
|
||||
|
||||
|
||||
|
||||
## User Guide
|
||||
|
||||
```c
|
||||
#include "esp_tts.h"
|
||||
#include "esp_tts_voice_female.h"
|
||||
#include "esp_partition.h"
|
||||
|
||||
/*** 1. create esp tts handle ***/
|
||||
|
||||
|
||||
// initial voice set from separate voice data partition
|
||||
|
||||
const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
|
||||
if (part==0) printf("Couldn't find voice data partition!\n");
|
||||
spi_flash_mmap_handle_t mmap;
|
||||
uint16_t* voicedata;
|
||||
esp_err_t err=esp_partition_mmap(part, 0, part->size, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
|
||||
esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
|
||||
|
||||
// 2. parse text and synthesis wave data
|
||||
char *text="欢迎使用乐鑫语音合成";
|
||||
if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list
|
||||
int len[1]={0};
|
||||
do {
|
||||
short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
|
||||
i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output
|
||||
} while(len[0]>0);
|
||||
i2s_zero_dma_buffer(0);
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
更多请参考[esp_tts.h](./esp_tts_chinese/include/esp_tts.h)查看API定义, 或参考esp-skainet中[chinese_tts](https://github.com/espressif/esp-skainet/tree/master/examples/chinese_tts)示例.
|
||||
Espressif TTS speech synthesis model is a lightweight speech synthesis system designed for embedded systems. Currently, only the Chinese language is supported. For more information, see the [documentation](https://docs.espressif.com/projects/esp-sr/en/latest/esp32s3/speech_synthesis/readme.html).
|
||||
@ -1,92 +0,0 @@
|
||||
## ESP Chinese TTS [[中文]](./README.md)
|
||||
|
||||
Espressif Chinese TTS is a lightweight TTS system designed for embedded systems。
|
||||
|
||||
## Overview
|
||||
|
||||
The Chinese TTS is based on concatenative method. The flow diagram of system is as follows:
|
||||
|
||||

|
||||
|
||||
- **Parser** : a Chinese grapheme to phoneme module, input text (UTF-8) and output Chinese pinyin list.
|
||||
- **Synthesizer** : a concatenative synthesizer, input pinyin list and output wave raw data. The default encoding of raw data is mono, 16 bit@16000 Hz.
|
||||
|
||||
#### Features
|
||||
|
||||
- [x] UTF-8 encoding text input
|
||||
|
||||
- [x] Streaming output
|
||||
|
||||
- [x] Polyphonic pronunciation
|
||||
|
||||
- [x] Adjustable speech rate
|
||||
|
||||
- [x] Digital broadcasting optimization
|
||||
|
||||
- [ ] Custom sound set
|
||||
|
||||
|
||||
|
||||
## Performance Test
|
||||
|
||||
#### Resource Occupancy
|
||||
|
||||
Flash image size: 2.2 MB
|
||||
|
||||
RAM runtime: 20 KB
|
||||
|
||||
CPU loading test(ESP32 @ 240 MHz):
|
||||
|
||||
| speech rate | 0 | 1 | 2 | 3 | 4 | 5 |
|
||||
| --------------------------- | :--: | :--: | :--: | :--: | :--: | :--: |
|
||||
| times faster than real time | 4.5 | 3.2 | 2.9 | 2.5 | 2.2 | 1.8 |
|
||||
|
||||
**Note:** the bigger rate, the faster speech speed. 0: slowest speaking speed, 5: fastest speaking speed.
|
||||
|
||||
#### Samples
|
||||
|
||||
- 欢迎使用乐鑫语音合成, [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav), [voice=小乐,speed=2](./samples/S1_xiaole_speed2.wav)
|
||||
|
||||
- 支付宝收款 1111.11 元, [voice=小乐,speed=0](./samples/S1_xiaole_speed0.wav), [voice=小乐,speed=2](./samples/S2_xiaole_speed2.wav)
|
||||
|
||||
- 空调制热模式已打开,并调节到25度, [voice=小乐,speed=0](./samples/S3_xiaole_speed0.wav), [voice=小乐,speed=4](./samples/S3_xiaole_speed4.wav)
|
||||
|
||||
## User Guide
|
||||
|
||||
```c
|
||||
#include "esp_tts.h"
|
||||
#include "esp_tts_voice_female.h"
|
||||
#include "esp_partition.h"
|
||||
|
||||
/*** 1. create esp tts handle ***/
|
||||
|
||||
//// Method1: use pre-define xiaole voice lib.
|
||||
//// This method is not recommended because the method may make app bin exceed the limit of esp32
|
||||
// esp_tts_handle_t *tts_handle=esp_tts_create(esp_tts_voice_female);
|
||||
|
||||
|
||||
// method2: initial voice set from separate voice data partition
|
||||
|
||||
const esp_partition_t* part=esp_partition_find_first(ESP_PARTITION_TYPE_DATA, ESP_PARTITION_SUBTYPE_DATA_FAT, "voice_data");
|
||||
if (part==0) printf("Couldn't find voice data partition!\n");
|
||||
spi_flash_mmap_handle_t mmap;
|
||||
uint16_t* voicedata;
|
||||
esp_err_t err=esp_partition_mmap(part, 0, 3*1024*1024, SPI_FLASH_MMAP_DATA, (const void**)&voicedata, &mmap);
|
||||
esp_tts_voice_t *voice=esp_tts_voice_set_init(&esp_tts_voice_template, voicedata);
|
||||
|
||||
// 2. parse text and synthesis wave data
|
||||
char *text="欢迎使用乐鑫语音合成";
|
||||
if (esp_tts_parse_chinese(tts_handle, text)) { // parse text into pinyin list
|
||||
int len[1]={0};
|
||||
do {
|
||||
short *data=esp_tts_stream_play(tts_handle, len, 4); // streaming synthesis
|
||||
i2s_audio_play(data, len[0]*2, portMAX_DELAY); // i2s output
|
||||
} while(len[0]>0);
|
||||
i2s_zero_dma_buffer(0);
|
||||
}
|
||||
|
||||
```
|
||||
|
||||
please refer to [esp_tts.h](./esp_tts_chinese/include/esp_tts.h) for the details of API or [chinese_tts](../../examples/chinese_tts) example in esp-skainet.
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user