Merge branch 'docs/md_2_rst_gitlab' into 'master'

docs/trans_md_2_rst

See merge request speech-recognition-framework/esp-sr!2
This commit is contained in:
Sun Xiang Yu 2022-11-30 10:21:44 +08:00
commit 5f7faa0328
88 changed files with 4289 additions and 2100 deletions

30
.gitignore vendored
View File

@ -1,4 +1,32 @@
# VS Code Settings
.vscode/
include/config
include/sdkconfig.h
build/
sdkconfig.old
sdkconfig
model/target/*
# NOTE(review): resolved an unresolved merge conflict here. The incoming side
# (model/target/*, .vscode, docs/_build/*) was fully covered by patterns already
# present above (model/target/* at the line before this block, .vscode/,
# docs/_build/ below), so the HEAD side is the correct union of both.
.DS_Store
*.pyc
# Doc build artifacts
docs/_build/
docs/*/_build/
docs/*/doxygen-warning-log.txt
docs/*/sphinx-warning-log.txt
docs/*/sphinx-warning-log-sanitized.txt
docs/*/xml/
docs/*/xml_in/
docs/*/man/
docs/doxygen_sqlite3.db
# Downloaded font files
docs/_static/DejaVuSans.ttf
docs/_static/NotoSansSC-Regular.otf

160
.gitlab-ci.yml Normal file
View File

@ -0,0 +1,160 @@
# GitLab CI pipeline for building and deploying the ESP-SR documentation.
# NOTE(review): indentation was restored from the GitLab CI schema — the source
# text had all leading whitespace stripped; verify against the original file.
stages:
  - build
  - deploy
  - deploy_docs

# global variables
variables: &global-variables
  ESP_DOCS_ENV_IMAGE: "$CI_DOCKER_REGISTRY/esp-idf-doc-env-v5.0:2-2"
  IDF_PATH: "$CI_PROJECT_DIR/esp-idf"
  IDF_REPO: ${GITLAB_SSH_SERVER}/espressif/esp-idf.git
  GIT_STRATEGY: clone
  GIT_SUBMODULE_STRATEGY: recursive
  ESPCI_TOKEN: $GITLAB_KEY

before_script:
  - echo $ESP_DOCS_ENV_IMAGE

default:
  retry:
    max: 2
    # In case of a runner failure we could hop to another one, or a network error could go away.
    when: always

.setup_idf_ci_env: &setup_idf_ci_env
  - source esp-idf/tools/ci/utils.sh
  - source esp-idf/tools/ci/configure_ci_environment.sh
  - esp-idf/tools/idf_tools.py install
  - esp-idf/tools/idf_tools.py export

doc_build_html_en:
  stage: build
  image: $ESP_DOCS_ENV_IMAGE
  tags:
    - build_docs
  needs: []
  artifacts:
    when: always
    paths:
      - docs/_build/*/*/*.txt
      - docs/_build/*/*/html/*
    expire_in: 6 mos
  script:
    - cd docs
    - ./check_lang_folder_sync.sh
    - ./check_doc_chars.py
    - pip install -r requirements.txt
    # build html
    - build-docs -t esp32 -l en -bs html

doc_build_html_cn:
  stage: build
  image: $ESP_DOCS_ENV_IMAGE
  tags:
    - build_docs
  needs: []
  artifacts:
    when: always
    paths:
      - docs/_build/*/*/*.txt
      - docs/_build/*/*/html/*
    expire_in: 6 mos
  script:
    - cd docs
    - ./check_lang_folder_sync.sh
    - ./check_doc_chars.py
    - pip install -r requirements.txt
    # build html
    - build-docs -t esp32 -l zh_CN -bs html

# Separate PDF build and HTML build due to artifacts size limit.
doc_build_pdf_en:
  stage: build
  image: $ESP_DOCS_ENV_IMAGE
  tags:
    - build_docs
  needs: []
  artifacts:
    when: always
    paths:
      - docs/_build/*/*/*.txt
      - docs/_build/*/*/latex/*
    expire_in: 6 mos
  script:
    - cd docs
    - ./check_lang_folder_sync.sh
    - ./check_doc_chars.py
    - pip install -r requirements.txt
    # build pdf
    - build-docs -t esp32 -l en -bs latex

doc_build_pdf_cn:
  stage: build
  image: $ESP_DOCS_ENV_IMAGE
  tags:
    - build_docs
  needs: []
  artifacts:
    when: always
    paths:
      - docs/_build/*/*/*.txt
      - docs/_build/*/*/latex/*
    expire_in: 6 mos
  script:
    - cd docs
    - ./check_lang_folder_sync.sh
    - ./check_doc_chars.py
    - pip install -r requirements.txt
    # build pdf
    - build-docs -t esp32 -l zh_CN -bs latex

# Hidden template job shared by the preview and production deploy jobs.
.deploy_docs_template:
  stage: deploy_docs
  image: $ESP_DOCS_ENV_IMAGE
  tags:
    - deploy
  needs:
    - doc_build_html_en
    - doc_build_html_cn
    - doc_build_pdf_en
    - doc_build_pdf_cn
  script:
    - source ${CI_PROJECT_DIR}/docs/utils.sh
    - add_doc_server_ssh_keys $DOCS_DEPLOY_PRIVATEKEY $DOCS_DEPLOY_SERVER $DOCS_DEPLOY_SERVER_USER
    - export GIT_VER=$(git describe --always)
    - pip install -r ${CI_PROJECT_DIR}/docs/requirements.txt
    - deploy-docs

deploy_docs_preview:
  extends:
    - .deploy_docs_template
  except:
    refs:
      - master
  variables:
    TYPE: "preview"
    DOCS_BUILD_DIR: "${CI_PROJECT_DIR}/docs/_build/"
    DOCS_DEPLOY_PRIVATEKEY: "$DOCS_PREVIEW_DEPLOY_KEY"
    DOCS_DEPLOY_SERVER: "$DOCS_PREVIEW_SERVER"
    DOCS_DEPLOY_SERVER_USER: "$DOCS_PREVIEW_SERVER_USER"
    DOCS_DEPLOY_PATH: "$DOCS_PREVIEW_PATH"
    DOCS_DEPLOY_URL_BASE: "https://$DOCS_PREVIEW_SERVER_URL/docs/esp-sr"

deploy_docs_production:
  extends:
    - .deploy_docs_template
  # only:
  #   refs:
  #     - master
  #     - /^release\/v.*$/
  # NOTE(review): the 'only: master' rule above is commented out while
  # 'except: master' below is active, so this production job can never run
  # on master — looks unintentional (possibly left over from testing); confirm.
  except:
    refs:
      - master
  variables:
    TYPE: "production"
    DOCS_BUILD_DIR: "${CI_PROJECT_DIR}/docs/_build/"
    DOCS_DEPLOY_PRIVATEKEY: "$DOCS_PROD_DEPLOY_KEY"
    DOCS_DEPLOY_SERVER: "$DOCS_PROD_SERVER"
    DOCS_DEPLOY_SERVER_USER: "$DOCS_PROD_SERVER_USER"
    DOCS_DEPLOY_PATH: "$DOCS_PROD_PATH"
    DOCS_DEPLOY_URL_BASE: "https://docs.espressif.com/projects/esp-sr"

18
.readthedocs.yml Normal file
View File

@ -0,0 +1,18 @@
# .readthedocs.yml
# Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
# NOTE(review): indentation restored — the source text had all leading
# whitespace stripped; verify against the original file.

# Required
version: 2

# Optionally build your docs in additional formats such as PDF and ePub
formats:
  - htmlzip
  - pdf

# Optionally set the version of Python and requirements required to build your docs
python:
  # Quoted so YAML keeps it a string (3.10-style versions parse as floats).
  version: "3.7"
  install:
    - requirements: docs/setuptools.requirements.txt
    - requirements: docs/requirements.txt

70
docs/Doxyfile Executable file
View File

@ -0,0 +1,70 @@
# This is Doxygen configuration file
#
# Doxygen provides over 260 configuration statements
# To make this file easier to follow,
# it contains only statements that are non-default
#
# NOTE:
# It is recommended not to change defaults unless specifically required
# Test any changes how they affect generated documentation
# Make sure that correct warnings are generated to flag issues with documented code
#
# For the complete list of configuration statements see:
# http://doxygen.nl/manual/config.html
PROJECT_NAME = "ESP-SR User Guide"
## The 'INPUT' statement below is used as input by script 'gen-df-input.py'
## to automatically generate API reference list files heder_file.inc
## These files are placed in '_inc' directory
## and used to include in API reference documentation
INPUT = \
## $(PROJECT_PATH)/src/include/esp_mn_speech_commands.h
## $(PROJECT_PATH)/components/at/include/esp_at.h
## Get warnings for functions that have no documentation for their parameters or return value
##
WARN_NO_PARAMDOC = YES
## Enable preprocessing and remove __attribute__(...) expressions from the INPUT files
##
ENABLE_PREPROCESSING = YES
MACRO_EXPANSION = YES
EXPAND_ONLY_PREDEF = YES
PREDEFINED = \
__attribute__(x)= \
IDF_DEPRECATED(X)= \
IRAM_ATTR= \
configSUPPORT_DYNAMIC_ALLOCATION=1 \
configSUPPORT_STATIC_ALLOCATION=1 \
configQUEUE_REGISTRY_SIZE=1 \
configUSE_RECURSIVE_MUTEXES=1 \
configTHREAD_LOCAL_STORAGE_DELETE_CALLBACKS=1 \
configNUM_THREAD_LOCAL_STORAGE_POINTERS=1 \
configUSE_APPLICATION_TASK_TAG=1 \
configTASKLIST_INCLUDE_COREID=1
## Do not complain about not having dot
##
HAVE_DOT = NO
## Generate XML that is required for Breathe
##
GENERATE_XML = YES
XML_OUTPUT = xml
GENERATE_HTML = NO
## (removed duplicate 'HAVE_DOT = NO' that appeared here — already set above)
GENERATE_LATEX = NO
GENERATE_MAN = YES
GENERATE_RTF = NO
## Skip distracting progress messages
##
QUIET = YES
## Log warnings in a file for further review
##
WARN_LOGFILE = "doxygen-warning-log.txt"

Binary file not shown.

272
docs/_static/404-page__cn.svg vendored Normal file
View File

@ -0,0 +1,272 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 23.0.2, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<svg version="1.1"
id="图层_1" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 1000 580"
style="enable-background:new 0 0 1000 580;" xml:space="preserve">
<style type="text/css">
.st0{fill:#FFFFFF;}
.st1{fill:url(#polygon12_1_);}
.st2{fill:url(#polygon21_1_);}
.st3{opacity:0.27;fill:url(#circle42_1_);enable-background:new ;}
.st4{opacity:0.27;fill:url(#circle49_1_);enable-background:new ;}
.st5{fill:url(#polygon58_1_);}
.st6{fill:#444444;stroke:#FFFFFF;stroke-width:0.834;stroke-miterlimit:10;}
.st7{fill:none;stroke:#FFFFFF;stroke-width:1.1033;stroke-miterlimit:10;}
.st8{fill:none;stroke:#353535;stroke-width:1.1033;stroke-miterlimit:10;}
.st9{fill:#FFFFFF;stroke:#444444;stroke-width:0.834;stroke-miterlimit:10;}
.st10{fill:#444444;stroke:#FFFFFF;stroke-width:0.8485;stroke-miterlimit:10;}
.st11{fill:none;stroke:#FFFFFF;stroke-width:1.1226;stroke-miterlimit:10;}
.st12{fill:none;stroke:#353535;stroke-width:1.1226;stroke-miterlimit:10;}
.st13{fill:#FFFFFF;stroke:#444444;stroke-width:0.8485;stroke-miterlimit:10;}
.st14{fill:#353535;}
.st15{fill:#444444;stroke:#FFFFFF;stroke-width:0.9321;stroke-miterlimit:10;}
.st16{fill:none;stroke:#FFFFFF;stroke-width:1.046;stroke-miterlimit:10;}
.st17{fill:none;stroke:#353535;stroke-width:1.046;stroke-miterlimit:10;}
.st18{fill:#FFFFFF;stroke:#444444;stroke-width:0.7906;stroke-miterlimit:10;}
.st19{opacity:0.59;fill:#E0E0E0;enable-background:new ;}
.st20{fill:#FFFFFF;stroke:#444444;stroke-width:2;stroke-miterlimit:10;}
.st21{fill:none;stroke:#444444;stroke-width:2;stroke-miterlimit:10;}
.st22{fill:none;stroke:#444444;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:10;}
.st23{enable-background:new ;}
.st24{fill:#4D4D4D;}
</style>
<rect id="BG_2_" x="-1" y="-9.5" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st0" width="1012.9" height="600.4">
</rect>
<linearGradient id="polygon12_1_" gradientUnits="userSpaceOnUse" x1="1056.6168" y1="442.7242" x2="1119.4504" y2="176.2231" gradientTransform="matrix(0.9556 0.295 0.2974 -0.9605 -602.8147 155.4956)">
<stop offset="4.835800e-02" style="stop-color:#9FA0A0"/>
<stop offset="0.5227" style="stop-color:#D7D8D8;stop-opacity:0.4381"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon12" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st1" points="604.8,364.6
508,127.4 487,126.8 426.8,369.8 ">
</polygon>
<linearGradient id="polygon21_1_" gradientUnits="userSpaceOnUse" x1="190.9867" y1="163.1146" x2="275.0967" y2="-193.6272" gradientTransform="matrix(0.9983 -5.887031e-02 -5.887031e-02 -0.9983 70.2473 159.8108)">
<stop offset="4.835800e-02" style="stop-color:#898989"/>
<stop offset="0.5874" style="stop-color:#D7D7D7;stop-opacity:0.3616"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon21" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st2" points="325.4,149.1
289.2,157.9 291.9,431 480.9,444.4 ">
</polygon>
<radialGradient id="circle42_1_" cx="836.3" cy="73.2901" r="65.713" gradientTransform="matrix(1 0 0 -1 12 560.79)" gradientUnits="userSpaceOnUse">
<stop offset="0" style="stop-color:#FFFFFF"/>
<stop offset="1" style="stop-color:#FFFFFF;stop-opacity:0"/>
</radialGradient>
<circle id="circle42" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st3" cx="848.3" cy="487.5" r="68.3">
</circle>
<radialGradient id="circle49_1_" cx="473.9905" cy="506.6938" r="65.7439" gradientTransform="matrix(1 0 0 -1 12 598.79)" gradientUnits="userSpaceOnUse">
<stop offset="0" style="stop-color:#FFFFFF"/>
<stop offset="1" style="stop-color:#FFFFFF;stop-opacity:0"/>
</radialGradient>
<circle id="circle49" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st4" cx="486" cy="92.1" r="68.3">
</circle>
<linearGradient id="polygon58_1_" gradientUnits="userSpaceOnUse" x1="1955.3258" y1="84.9918" x2="2021.5074" y2="-195.7096" gradientTransform="matrix(0.8607 0.5092 0.5092 -0.8607 -969.5651 -847.6453)">
<stop offset="4.835800e-02" style="stop-color:#898989"/>
<stop offset="0.5874" style="stop-color:#D7D7D7;stop-opacity:0.3616"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon58" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st5" points="547.4,377
763.1,422.2 713.7,157.4 683.8,150 ">
</polygon>
<g id="g94" transform="rotate(9.0573675,796.06564,263.99283)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path60" inkscape:connector-curvature="0" class="st6" d="M515,153.6L439.2,164c-5.6,0.8-10.8-3.2-11.5-8.7l0,0
c-0.8-5.6,3.2-10.8,8.7-11.5l75.8-10.4c5.6-0.8,10.8,3.2,11.5,8.7l0,0C524.5,147.7,520.6,152.8,515,153.6L515,153.6z"/>
<ellipse id="circle62" transform="matrix(-0.9875 0.1574 -0.1574 -0.9875 905.3914 234.4057)" class="st7" cx="443.4" cy="153.1" rx="5.6" ry="5.6"/>
<ellipse id="circle64" transform="matrix(-0.9875 0.1574 -0.1574 -0.9875 949.0156 224.9154)" class="st7" cx="465.6" cy="150" rx="5.6" ry="5.6"/>
<ellipse id="circle66" transform="matrix(-0.9875 0.1574 -0.1574 -0.9875 943.6016 103.0671)" class="st8" cx="467.7" cy="88.9" rx="3.9" ry="3.9"/>
<ellipse id="circle68" transform="matrix(-0.9875 0.1574 -0.1574 -0.9875 993.4196 215.2251)" class="st7" cx="488.2" cy="147" rx="5.6" ry="5.6"/>
<ellipse id="circle70" transform="matrix(-0.3136 0.9496 -0.9496 -0.3136 806.4566 -295.6625)" class="st7" cx="510.1" cy="143.7" rx="5.6" ry="5.6"/>
<path id="path72" inkscape:connector-curvature="0" class="st9" d="M505,132.9l-61.3,8.4l-0.4-2.8c-0.2-1.5,0.8-2.9,2.3-3.1
l55.7-7.7c1.5-0.2,2.9,0.8,3.1,2.3L505,132.9z"/>
<path id="path74" inkscape:connector-curvature="0" class="st9" d="M514.9,155.1l-75,10.3l0.3,2.4c0.3,1.7,1.8,2.8,3.4,2.6
l69.1-9.5c1.7-0.3,2.8-1.8,2.6-3.4L514.9,155.1z"/>
<path id="path76" inkscape:connector-curvature="0" class="st8" d="M503.2,163.8l-50.5,6.9l0.4,3c0.2,1.3,1.5,2.3,2.8,2.2l45.5-6.3
c1.3-0.2,2.3-1.5,2.2-2.8L503.2,163.8z"/>
<path id="path78" inkscape:connector-curvature="0" class="st8" d="M494.5,127.1l-43.7,6l-1.8-13.2c-1-7.3,4.1-14,11.3-15l17.3-2.4
c7.3-1,14,4.1,15,11.3L494.5,127.1z"/>
<line id="line80" class="st8" x1="468.5" y1="94" x2="469.6" y2="102.5"/>
<line id="line82" class="st8" x1="466.6" y1="112.8" x2="459.5" y2="125.1"/>
<line id="line84" class="st8" x1="480.9" y1="111.3" x2="473.7" y2="123.6"/>
<path id="path86" inkscape:connector-curvature="0" class="st9" d="M465.7,176.2l-3.7,10.2c-0.2,0.5,0.5,0.9,0.8,0.5l10.5-11.3
c0.3-0.3,0-0.9-0.4-0.8l-6.8,1C466,175.9,465.8,176,465.7,176.2z"/>
<path id="path88" inkscape:connector-curvature="0" class="st9" d="M455.7,177.4l-6.3,9.2c-0.3,0.5,0.3,1,0.8,0.7l12.8-10.1
c0.4-0.3,0.2-0.9-0.4-0.9l-6.6,0.9C455.9,177.3,455.8,177.4,455.7,177.4L455.7,177.4z"/>
<path id="path90" inkscape:connector-curvature="0" class="st9" d="M491.6,172.4l6.3,8.8c0.3,0.4-0.2,1-0.7,0.8l-13.2-8
c-0.4-0.3-0.3-0.8,0.2-0.9l6.8-0.8C491.3,172.2,491.5,172.3,491.6,172.4L491.6,172.4z"/>
<path id="path92" inkscape:connector-curvature="0" class="st9" d="M501.7,170.9l8.5,7.2c0.4,0.3,0,1.1-0.5,0.8l-15.1-6.3
c-0.5-0.2-0.4-0.9,0.1-0.9l6.6-0.9C501.4,170.8,501.5,170.8,501.7,170.9L501.7,170.9z"/>
</g>
<g id="g130" transform="translate(-131.09867,-443.26745)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path96" inkscape:connector-curvature="0" class="st10" d="M875,588.1l-76.7-13.2c-5.6-0.9-9.4-6.4-8.5-12l0,0
c0.9-5.6,6.4-9.4,12-8.5l76.7,13.2c5.6,0.9,9.4,6.4,8.5,12l0,0C885.9,585.3,880.6,589.1,875,588.1z"/>
<circle id="circle98" class="st11" cx="805.6" cy="565.5" r="5.7"/>
<circle id="circle100" class="st11" cx="828.2" cy="569.4" r="5.7"/>
<circle id="circle102" class="st12" cx="849" cy="510.8" r="4"/>
<circle id="circle104" class="st11" cx="851" cy="573.3" r="5.7"/>
<circle id="circle106" class="st11" cx="873.2" cy="577" r="5.7"/>
<path id="path108" inkscape:connector-curvature="0" class="st13" d="M871.6,564.9l-61.9-10.7l0.5-2.8c0.3-1.5,1.7-2.5,3.2-2.3
l56.4,9.8c1.5,0.3,2.5,1.7,2.3,3.2L871.6,564.9z"/>
<path id="path110" inkscape:connector-curvature="0" class="st13" d="M874.4,589.5l-76-13.2l-0.4,2.5c-0.3,1.7,0.8,3.3,2.5,3.6
l69.8,12c1.7,0.3,3.3-0.8,3.6-2.5L874.4,589.5z"/>
<path id="path112" inkscape:connector-curvature="0" class="st14" d="M860.3,594.3l-51.1-8.8l-0.5,3.1c-0.3,1.4,0.7,2.7,2.1,3l46,8
c1.4,0.3,2.7-0.7,3-2.1L860.3,594.3z"/>
<path id="path114" inkscape:connector-curvature="0" class="st12" d="M863.2,556.2l-44.2-7.6l2.3-13.3c1.3-7.3,8.3-12.3,15.6-11
l17.5,3.1c7.3,1.3,12.3,8.3,11,15.6L863.2,556.2z"/>
<line id="line116" class="st12" x1="848.1" y1="516.1" x2="846.6" y2="524.6"/>
<line id="line118" class="st12" x1="840.5" y1="533.7" x2="829.8" y2="543.5"/>
<line id="line120" class="st12" x1="854.8" y1="536.6" x2="844.2" y2="546.3"/>
<path id="path122" inkscape:connector-curvature="0" class="st13" d="M820.1,594.8l-6.8,8.7c-0.3,0.4,0.2,1,0.7,0.8l13.7-7.6
c0.4-0.3,0.3-0.8-0.2-0.9l-7-1.1C820.5,594.6,820.3,594.7,820.1,594.8L820.1,594.8z"/>
<path id="path124" inkscape:connector-curvature="0" class="st13" d="M810,593l-8.9,7c-0.4,0.3-0.1,1.1,0.5,0.8l15.5-5.9
c0.5-0.2,0.4-0.8-0.1-1l-6.6-1.2C810.3,592.8,810.1,592.9,810,593L810,593z"/>
<path id="path126" inkscape:connector-curvature="0" class="st13" d="M846.4,599.1l3.5,10.5c0.2,0.5-0.5,0.9-0.8,0.5l-10.4-11.8
c-0.3-0.3,0-0.9,0.5-0.8l6.9,1.3C846.3,598.9,846.4,599,846.4,599.1L846.4,599.1z"/>
<path id="path128" inkscape:connector-curvature="0" class="st13" d="M856.6,600.8l6,9.6c0.3,0.5-0.3,1-0.8,0.7l-12.7-10.7
c-0.4-0.3-0.1-1,0.4-0.8l6.6,1.2C856.4,600.6,856.5,600.7,856.6,600.8L856.6,600.8z"/>
</g>
<g id="g166" transform="translate(6.564267,-535.67492)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path132" inkscape:connector-curvature="0" class="st15" d="M331.5,664.9l-70.2,18.3c-5.1,1.3-10.4-1.8-11.7-6.9l0,0
c-1.3-5.1,1.8-10.4,6.9-11.7l70.2-18.3c5.1-1.3,10.4,1.8,11.7,6.9l0,0C339.7,658.2,336.7,663.6,331.5,664.9z"/>
<circle id="circle134" class="st16" cx="264.1" cy="672.3" r="5.3"/>
<circle id="circle136" class="st16" cx="284.7" cy="667" r="5.3"/>
<circle id="circle138" class="st17" cx="279.7" cy="609.2" r="3.7"/>
<circle id="circle140" class="st16" cx="305.5" cy="661.5" r="5.3"/>
<circle id="circle142" class="st16" cx="325.8" cy="656.1" r="5.3"/>
<path id="path144" inkscape:connector-curvature="0" class="st18" d="M319.7,646.5L263,661.3l-0.7-2.5c-0.4-1.4,0.5-2.8,1.9-3.2
l51.6-13.4c1.4-0.4,2.8,0.5,3.2,1.9L319.7,646.5z"/>
<path id="path146" inkscape:connector-curvature="0" class="st18" d="M331.5,666.3L262,684.4l0.6,2.2c0.4,1.6,2,2.5,3.5,2.1
L330,672c1.6-0.4,2.5-2,2.1-3.5L331.5,666.3z"/>
<path id="path148" inkscape:connector-curvature="0" class="st14" d="M321.5,675.8L274.8,688l0.7,2.8c0.4,1.3,1.7,2.1,2.9,1.7
l42.1-11c1.3-0.4,2.1-1.7,1.7-2.9L321.5,675.8z"/>
<path id="path150" inkscape:connector-curvature="0" class="st17" d="M309.3,642.2l-40.5,10.5l-3.2-12.2c-1.8-6.7,2.3-13.6,9-15.4
l16-4.2c6.7-1.8,13.6,2.3,15.4,9L309.3,642.2z"/>
<line id="line152" class="st17" x1="281" y1="614.1" x2="283.1" y2="621.9"/>
<line id="line154" class="st17" x1="281.4" y1="631.9" x2="276.1" y2="644.3"/>
<line id="line156" class="st17" x1="294.6" y1="628.9" x2="289.3" y2="641.3"/>
<path id="path158" inkscape:connector-curvature="0" class="st18" d="M287.6,691.6l-2.3,10.1c-0.1,0.5,0.6,0.8,0.8,0.4l8.7-11.7
c0.3-0.4-0.1-0.8-0.5-0.7l-6.3,1.8C287.8,691.3,287.6,691.4,287.6,691.6L287.6,691.6z"/>
<path id="path160" inkscape:connector-curvature="0" class="st18" d="M278.3,693.9l-4.8,9.3c-0.3,0.5,0.4,0.9,0.7,0.6l10.9-10.9
c0.4-0.4,0-0.9-0.5-0.8l-6.1,1.6C278.5,693.8,278.4,693.8,278.3,693.9L278.3,693.9z"/>
<path id="path162" inkscape:connector-curvature="0" class="st18" d="M311.6,685.2l7,7.6c0.4,0.4-0.1,0.9-0.6,0.7l-13.3-6.1
c-0.4-0.2-0.4-0.7,0.1-0.9l6.3-1.6C311.3,685,311.5,685.1,311.6,685.2z"/>
<path id="path164" inkscape:connector-curvature="0" class="st18" d="M320.8,682.7l8.8,5.8c0.5,0.3,0.1,1-0.4,0.8l-14.9-4.2
c-0.5-0.1-0.5-0.8,0-0.9l6.1-1.6C320.6,682.6,320.7,682.6,320.8,682.7z"/>
</g>
<path id="path168" inkscape:connector-curvature="0" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st19" d="
M241.1,524.1h-1c0.4-1.2,0.6-2.4,0.6-3.7c0-7-5.7-12.7-12.7-12.7h-3.3c-2.7,0-5.2,0.9-7.2,2.3c-1.2-0.8-2.7-1.3-4.3-1.3
c-0.7,0-1.4,0.1-2.1,0.3c-0.1-3.9-3.3-7.1-7.2-7.1h-0.3c-2.2,0-4.1,1-5.4,2.5l0,0c2.7-3.8,4.4-8.5,4.4-13.6
c0-13-10.6-23.5-23.5-23.5c-13,0-23.5,10.6-23.5,23.5c0,0.7,0,1.3,0.1,1.9c-2.9,0.3-5.7,1.3-8,2.9c-3.2-4.3-8.2-7.1-13.9-7.1l0,0
c-9.5,0-17.3,7.8-17.3,17.3c0,0.1,0,0.3,0,0.4c-1.8-0.9-3.7-1.5-5.9-1.5c-6.9,0-12.6,5.7-12.6,12.6c0,0.7,0.1,1.5,0.2,2.2H92
c-3,0-5.5,1.9-6.5,4.5h-9.9c-2.5,0-4.5,2-4.5,4.5s2,4.5,4.5,4.5h149.2h3.3h13.1c2.5,0,4.5-2,4.5-4.5
C245.6,526.1,243.6,524.1,241.1,524.1L241.1,524.1z"/>
<path id="path170" inkscape:connector-curvature="0" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st19" d="
M898.4,516.3h-0.9c0.3-1,0.5-2.1,0.5-3.3c0-6.2-5.1-11.3-11.3-11.3h-2.9c-2.4,0-4.6,0.8-6.4,2c-1.1-0.7-2.4-1.1-3.8-1.1
c-0.6,0-1.2,0.1-1.8,0.3c-0.1-3.5-2.9-6.3-6.4-6.3H865c-1.9,0-3.6,0.9-4.8,2.2l0,0c2.4-3.4,3.9-7.6,3.9-12
c0-11.5-9.4-20.9-20.9-20.9s-20.9,9.4-20.9,20.9c0,0.6,0,1.2,0.1,1.7c-2.6,0.2-5,1.2-7.1,2.6c-2.8-3.8-7.3-6.3-12.3-6.3l0,0
c-8.5,0-15.4,6.9-15.4,15.4c0,0.1,0,0.2,0,0.4c-1.6-0.8-3.3-1.4-5.2-1.4c-6.2,0-11.2,5-11.2,11.2c0,0.7,0.1,1.3,0.2,1.9h-5.5
c-2.6,0-4.9,1.7-5.8,4h-8.8c-2.2,0-4,1.8-4,4s1.8,4,4,4h132.5h2.9h11.6c2.2,0,4-1.8,4-4C902.4,518.1,900.6,516.3,898.4,516.3
L898.4,516.3z"/>
<g id="g184" transform="translate(10.641067,-115.56078)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<g id="g178">
<path id="path172" inkscape:connector-curvature="0" class="st20" d="M149.5,586c2.4,4.6,6.2,8.3,10.8,10.8c3.6,1.9,7.6,3,11.9,3
c14.2,0,25.7-11.5,25.7-25.7c0-4.3-1.1-8.4-3-11.9c-2.4-4.6-6.2-8.3-10.8-10.8c-3.6-1.9-7.6-3-11.9-3c-14.2,0-25.7,11.5-25.7,25.7
C146.6,578.4,147.7,582.4,149.5,586z"/>
<path id="path174" inkscape:connector-curvature="0" class="st19" d="M194.1,562.5c-2.3-4.4-6-8.1-10.4-10.4
c-3.4-1.8-7.4-2.9-11.5-2.9c-9.5,0-17.7,5.3-21.8,13.1c4-2.8,8.9-4.4,14.1-4.4c4.2,0,8.1,1,11.5,2.9c4.4,2.3,8.1,6,10.4,10.4
c1.8,3.4,2.9,7.4,2.9,11.5c0,4.2-1.1,8.2-2.9,11.7c6.4-4.5,10.6-11.9,10.6-20.3C197,569.8,196,565.9,194.1,562.5L194.1,562.5z"/>
<path id="path176" inkscape:connector-curvature="0" class="st21" d="M149.5,586c-7.7,10-11.6,17.8-9.3,20.1s10.1-1.6,20.1-9.3
c5.6-4.4,12-9.9,18.3-16.3c6.4-6.4,11.9-12.7,16.3-18.3c7.7-10,11.6-17.8,9.3-20.1s-10.1,1.6-20.1,9.3"/>
</g>
<path id="path180" inkscape:connector-curvature="0" class="st22" d="M154,566.3c0.5-1.2,1.1-2.3,1.8-3.4c0.7-1.1,1.5-2,2.4-2.9
s1.9-1.7,2.9-2.4c1.1-0.7,2.2-1.3,3.4-1.8s2.4-0.9,3.7-1.2s2.6-0.4,4-0.4"/>
<path id="path182" inkscape:connector-curvature="0" class="st22" d="M152.4,574.1c0-1.4,0.1-2.7,0.4-4"/>
</g>
<g id="g192" transform="translate(-0.2304,235.22748)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<polygon id="polygon186" class="st20" points="843.4,216.1 850,204.1 856.6,216.1 868.7,222.7 856.6,229.3 850,241.4 843.4,229.3
831.4,222.7 "/>
<polygon id="polygon188" class="st20" points="868.4,248.1 873.4,239.1 878.3,248.1 887.4,253.1 878.3,258 873.4,267.1 868.4,258
859.4,253.1 "/>
<polygon id="polygon190" class="st20" points="884.1,207.8 887.4,201.7 890.7,207.8 896.7,211.1 890.7,214.4 887.4,220.4
884.1,214.4 878,211.1 "/>
</g>
<g inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st23">
<path class="st24" d="M331.6,348.5l65.5-89.9h24.1v89.6h17.5V369h-17.5v28.8h-27.9V369h-61.7V348.5z M357.8,348.1h35.6v-34.4
c0-5.9,0.2-11.6,0.8-17h-0.9c-4.9,7.8-8,12.7-9.4,14.6L357.8,348.1z"/>
<path class="st24" d="M449.2,328.2c0-11.1,1.1-21,3.2-29.6s4.9-15.5,8.2-20.7c3.3-5.2,7.3-9.4,11.9-12.8c4.6-3.3,9.1-5.6,13.5-6.8
c4.4-1.2,9-1.8,13.7-1.8c16.3,0,28.8,6.4,37.5,19.1c8.7,12.7,13.1,30.3,13.1,52.6c0,22.1-4.4,39.6-13.1,52.4s-21.2,19.2-37.4,19.2
c-4.5,0-8.9-0.6-13.2-1.7c-4.3-1.1-8.7-3.3-13.4-6.6c-4.6-3.2-8.7-7.4-12.1-12.5c-3.5-5.1-6.3-12-8.6-20.8
C450.4,349.6,449.2,339.6,449.2,328.2z M478.2,328.2c0,33.8,7.2,50.7,21.6,50.7c14.2,0,21.3-16.9,21.3-50.7s-7.2-50.7-21.5-50.7
C485.4,277.5,478.2,294.4,478.2,328.2z"/>
<path class="st24" d="M561.2,348.5l65.5-89.9h24.1v89.6h17.5V369h-17.5v28.8h-27.8V369h-61.7L561.2,348.5L561.2,348.5z
M587.3,348.1h35.6v-34.4c0-5.9,0.2-11.6,0.8-17h-0.9c-4.9,7.8-8,12.7-9.4,14.6L587.3,348.1z"/>
</g>
<g inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path class="st24" d="M437.5,201.7c0-5.7,1.4-10.2,4.3-13.7c2.8-3.5,6.6-5.2,11.3-5.2s8.5,1.7,11.3,5.2c2.8,3.5,4.2,8,4.2,13.7
s-1.4,10.2-4.2,13.6c-2.8,3.5-6.6,5.1-11.3,5.1s-8.5-1.7-11.3-5.1C438.9,211.9,437.5,207.4,437.5,201.7z M444.4,201.7
c0,4,0.8,7.2,2.3,9.6c1.6,2.4,3.7,3.5,6.4,3.5c2.7,0,4.8-1.2,6.4-3.5c1.6-2.4,2.3-5.5,2.3-9.6c0-4-0.8-7.2-2.3-9.6
c-1.5-2.4-3.6-3.5-6.4-3.5c-2.7,0-4.9,1.2-6.4,3.5C445.2,194.5,444.4,197.7,444.4,201.7z"/>
<path class="st24" d="M471.9,206.5c0-3.9,1-7.2,3.1-9.9c2-2.7,5.1-4.1,9.2-4.1c2.7,0,5,0.7,7,2c1.9,1.3,3.3,3.1,4.1,5.1
s1.2,4.3,1.2,6.8c0,1.2-0.1,2.4-0.3,3.5c-0.2,1.2-0.6,2.4-1.2,3.7c-0.6,1.3-1.3,2.4-2.2,3.4c-0.9,0.9-2,1.7-3.5,2.4
c-1.5,0.6-3.1,0.9-5,0.9s-3.5-0.3-4.9-0.9c-1.5-0.6-2.7-1.3-3.5-2.3c-0.9-0.9-1.6-2-2.2-3.3c-0.5-1.3-1-2.5-1.2-3.7
C472,209.1,471.9,207.8,471.9,206.5z M478.4,206.5c0,3,0.6,5.1,1.7,6.4s2.4,2,3.9,2s2.7-0.7,3.9-2c1.2-1.3,1.7-3.5,1.7-6.4
c0-3-0.5-5.1-1.7-6.5c-1.2-1.3-2.5-2-4-2c-1.5,0-2.8,0.7-3.9,2C479.1,201.4,478.4,203.6,478.4,206.5z"/>
<path class="st24" d="M501.1,230.2v-36.8h6.1v3c1.6-2.5,3.8-3.8,6.6-3.8c3.2,0,5.9,1.3,8,3.8c2.1,2.5,3.2,5.9,3.2,10.2
c0,2.4-0.3,4.6-1,6.4c-0.7,1.9-1.6,3.3-2.7,4.4c-1.1,1-2.4,1.9-3.6,2.4c-1.3,0.5-2.5,0.8-3.8,0.8c-1.6,0-2.9-0.3-4-1
c-1-0.7-1.8-1.5-2.4-2.4v13.2h-6.4V230.2z M507.3,206.5c0,2.6,0.5,4.7,1.4,6.2c0.9,1.6,2.4,2.3,4.1,2.3c1.6,0,3-0.7,4-2.2
c1-1.5,1.5-3.6,1.5-6.3c0-2.6-0.5-4.7-1.5-6.2s-2.4-2.3-4.1-2.3c-1.7,0-3.1,0.8-4.1,2.4C507.8,201.9,507.3,203.9,507.3,206.5z"/>
<path class="st24" d="M527.5,214.1l4.5-2.7c1.8,2.6,4.1,3.9,6.9,3.9c1.3,0,2.3-0.2,3-0.8c0.7-0.6,1.1-1.2,1.1-2
c0-0.2,0-0.5-0.1-0.7s-0.2-0.5-0.4-0.6c-0.2-0.2-0.3-0.4-0.5-0.5c-0.2-0.2-0.4-0.3-0.7-0.5c-0.3-0.2-0.6-0.3-0.9-0.4
c-0.2-0.1-0.5-0.2-0.9-0.4c-0.5-0.2-0.8-0.3-1-0.4c-0.2-0.1-0.6-0.2-1.2-0.4c-0.6-0.2-0.9-0.3-1.2-0.4c-2.3-0.7-4.1-1.7-5.4-2.8
c-1.3-1.1-2-2.8-2-5c0-2.4,1-4.2,3.1-5.7c2-1.4,4.5-2.1,7.3-2.1c2.2,0,4.2,0.6,6.2,1.7c2,1.1,3.4,2.6,4.3,4.4l-4.2,2.5
c-1.9-2.4-3.9-3.5-6.3-3.5c-1.3,0-2.2,0.2-2.8,0.7c-0.7,0.5-1,1.1-1,2c0,0.2,0,0.5,0.1,0.8c0.1,0.2,0.2,0.5,0.4,0.7
c0.2,0.2,0.3,0.4,0.6,0.5c0.2,0.2,0.4,0.3,0.7,0.5c0.3,0.2,0.5,0.3,0.8,0.4c0.2,0.1,0.5,0.2,0.9,0.4c0.4,0.2,0.7,0.2,0.9,0.3
c0.2,0.1,0.6,0.2,1.1,0.4c0.5,0.2,0.8,0.3,1,0.3c5.2,1.7,7.8,4.3,7.8,7.9c0,2.2-0.9,4.1-2.7,5.6c-1.8,1.6-4.4,2.3-7.7,2.3
c-2.6,0-4.9-0.5-6.9-1.7C530.1,217.5,528.6,216,527.5,214.1z"/>
<path class="st24" d="M554.3,219.8v-7.3h8.2v7.3H554.3z M561.5,209.4h-6.2l-0.5-25.9h7.1L561.5,209.4z"/>
</g>
<g>
<path class="st24" d="M366.2,473c2.2,0,2.6-1.1,2.9-7.7c1.1,0.9,3.2,1.7,4.5,2c-0.6,8.2-2,10.3-7,10.3h-8.5c-5.6,0-7.1-1.5-7.1-6.7
V455h-0.4c-2.1,9.6-7.1,18.6-19.8,23.7c-0.6-1.2-2-2.9-3.1-3.9c11.4-4.3,16.1-11.7,18.2-19.8h-17.4v-4.6h18.3
c0.5-3.3,0.6-6.7,0.8-9.9h-16V436h38.9v4.5h-18.2c-0.2,3.2-0.3,6.6-0.8,9.9h21.7v4.6h-17.3v15.8c0,1.8,0.4,2.2,2.8,2.2H366.2z"/>
<path class="st24" d="M387,455.6c-2-1.7-6-4-9.2-5.4l2.5-3.5c3.1,1.2,7.3,3.4,9.3,5L387,455.6z M379.5,475
c2.7-3.8,6.6-10.2,9.5-15.9l3.4,3.1c-2.6,5.2-5.9,11.2-8.9,15.9L379.5,475z M383.2,433.4c3.2,1.3,7.3,3.5,9.4,5.1l-2.7,3.8
c-2-1.8-6.1-4.1-9.3-5.5L383.2,433.4z M418.7,478.7c-0.4-1.4-1.1-3-2.1-4.7c-17.7,2.1-19.9,2.5-21.4,3.2c-0.2-1-1-3.4-1.6-4.7
c1.1-0.2,2-1.4,3.2-3.2c1.1-1.4,3.9-6.3,6.2-11.6h-10.3v-4.4H405v-8.1h-10.3v-4.4H405v-8h4.8v8h10.7v4.4h-10.7v8.1h12.6v4.4h-13.8
c-2.3,5-5.2,9.9-8,13.9l13.9-1.5c-1.2-2-2.6-4-3.9-5.9l3.8-1.8c3.5,4.4,7.1,10.2,8.6,14.1L418.7,478.7z"/>
<path class="st24" d="M445.5,458.3c-2.1,0.6-4.2,1.2-6.3,1.8v13.1c0,2.4-0.5,3.6-2,4.2c-1.5,0.8-3.7,0.8-7.1,0.8
c-0.1-1.1-0.8-3-1.4-4.2c2.2,0.1,4.3,0.1,4.9,0.1c0.7-0.1,1-0.2,1-1v-11.8l-6.1,1.6l-1.2-4.5c2-0.5,4.6-1,7.3-1.8V447h-6.7v-4.3
h6.7v-9.8h4.6v9.8h5.8v4.3h-5.8v8.7c1.9-0.5,3.8-1,5.7-1.5L445.5,458.3z M470.4,453.5c-2.1,4.6-5.2,9.1-8.8,13.1
c1.6,3.8,3.7,6.1,6.1,6.4c1.2,0.1,2-2.5,2.4-7.9c0.9,1.1,3,2.2,3.9,2.7c-1.2,8.6-3.5,11-6.1,10.8c-4.4-0.4-7.7-3.4-10.1-8.3
c-3.2,2.8-6.8,5.2-10.4,6.9c-0.9-1.2-2.4-2.7-3.8-3.6c4.5-1.8,8.7-4.5,12.3-7.9c-1.4-4.2-2.3-9.4-3-15.3l-7,0.7l-0.5-4.5l7.1-0.7
c-0.3-4-0.6-8.3-0.8-12.8h4.7c0.1,4.3,0.4,8.4,0.8,12.2l15.1-1.5l0.5,4.4l-15.1,1.5c0.5,4.5,1.3,8.5,2.2,11.9
c2.8-3.3,5.1-6.9,6.7-10.5L470.4,453.5z M465.8,443.6c-1.3-2.1-4.3-5.2-6.6-7.5l3.5-2.3c2.4,2.1,5.4,5.2,6.9,7.1L465.8,443.6z"/>
<path class="st24" d="M494.2,469.6c3.3-0.6,6.7-1.2,10-1.8l0.2,4c-8.8,1.7-18.2,3.5-24.8,4.7l-1-4.4c3-0.4,7-1,11.2-1.8v-7.7h-9.5
v-4.1h9.5v-5h4.4v5h9.5v4.1h-9.5L494.2,469.6L494.2,469.6z M479.2,436h25.7v4.1h-13.4c-1.5,2.9-3.3,5.8-5.1,8.3l11.1-0.8
c-1.1-1.6-2.2-3.2-3.3-4.6l3.4-2c2.7,3.3,5.9,7.8,7.3,10.7l-3.6,2.3c-0.4-0.8-0.9-1.8-1.5-2.8c-14.8,1.2-16.7,1.5-18,2
c-0.3-1-1-3.1-1.5-4.2c0.8-0.2,1.6-1,2.4-2.2c0.6-0.9,2.3-3.6,3.7-6.7h-7.2V436z M511.5,467.3h-4.4v-30h4.4V467.3z M521.4,433.6
v38.7c0,2.6-0.6,4-2.2,4.7c-1.7,0.8-4.4,0.8-8.2,0.8c-0.2-1.2-1-3.3-1.7-4.5c2.8,0.1,5.6,0.1,6.4,0c0.8,0,1.1-0.2,1.1-1.1v-38.7
h4.6V433.6z"/>
<path class="st24" d="M544.4,466.2c0.3,1.3,1,3.2,1.4,4.1c-8.1,5.9-9.5,7-10.2,8c-0.5-1-1.8-2.6-2.5-3.3c1-0.8,2.5-2.6,2.5-5.2
v-17.2H528v-4.5h12v21.2L544.4,466.2z M537.7,444.6c-1.3-2.3-4.2-6-6.6-8.8l3.4-2.6c2.4,2.6,5.5,6.2,6.8,8.5L537.7,444.6z
M572.8,457.7c-2.4,3.7-5.4,7.1-8.9,10.1c3.2,2.5,6.9,5.5,9,7.5l-3.7,3c-1.8-2.1-5.4-5.2-8.6-7.8c-4.8,3.5-10.2,6.4-16.1,8.5
c-0.7-1.2-2-3-2.8-4c11.7-3.8,21.6-11.1,26.3-19.2L572.8,457.7z M569.2,448.1c-4.8,8.2-13,15.5-22.8,19.8c-0.6-1.1-1.9-2.8-2.7-3.7
c5.4-2.2,10.4-5.5,14.3-9.2c-9.3,0.7-10.9,1-11.9,1.3c-0.2-1.1-1-3.4-1.5-4.4c1.2-0.2,2.9-0.7,3.9-1.6s3.5-4.6,5.4-7.3h-10.2v-4.2
H557c-0.5-1.6-1.3-3.6-2.2-5.2l4.5-1.1c1.2,1.9,2.2,4.4,2.8,6.2h10.6v4.2H559c-1.7,2.6-3.9,5.8-5.8,8.3l8.4-0.5
c1.2-1.4,2.1-3,2.9-4.4L569.2,448.1z"/>
<path class="st24" d="M621.2,472.9c0,2.8-0.7,4.2-2.5,5s-4.8,0.9-9.3,0.9c-0.3-1.3-1.1-3.5-1.8-4.7c3.1,0.2,6.6,0.1,7.6,0.1
s1.3-0.3,1.3-1.3v-32.7h-32V470c3.2-3.2,5.7-7.5,7.4-12.6c-2.2-2.9-4.6-5.9-6.7-8.5l2.7-2.7c1.8,2,3.6,4.2,5.3,6.5
c0.8-3,1.5-6.3,1.9-9.7l4.2,0.5c-0.8,4.8-1.7,9.2-2.9,13.3c1.9,2.5,3.6,5,4.7,6.9l-3,3.2c-0.8-1.6-2-3.5-3.4-5.5
c-1.8,4.5-4,8.3-6.8,11.3c-0.7-0.6-2.4-1.8-3.4-2.4v8.4H580v-42.9h41.3V472.9z M612.2,468.2c-0.8-1.9-2-4.2-3.5-6.5
c-1.8,4.5-4.2,8.5-7.2,11.5c-0.8-0.6-2.6-2-3.6-2.5c3.5-3.2,6.2-7.8,8-13.2c-2-3-4.2-5.9-6.5-8.5l2.9-2.5c1.7,1.8,3.4,4,5,6.1
c0.7-3,1.2-6.1,1.7-9.4l4.2,0.5c-0.7,4.8-1.5,9.2-2.8,13.3c2.1,3,3.8,6,5,8.5L612.2,468.2z"/>
<path class="st24" d="M653.2,461c0,7-3.2,13.8-22.2,17.9c-0.5-1.1-1.8-2.8-2.9-3.8c17.8-3.5,20.2-9.1,20.2-14.1v-8.9h4.8v8.9H653.2
z M645.4,445.1c0.7-1.6,1.4-3.6,1.9-5.4h-17.9v-4.3h42.8v4.3h-19.7c-0.8,1.9-1.7,3.8-2.5,5.4h18V468h-5v-18.6h-24.4v18.8h-4.8
v-23.1L645.4,445.1L645.4,445.1z M655.2,466.4c5.6,2.3,13.2,6.2,17.1,8.9l-2.9,3.7c-3.6-2.8-11.2-6.8-16.9-9.4L655.2,466.4z"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 24 KiB

260
docs/_static/404-page__en.svg vendored Normal file
View File

@ -0,0 +1,260 @@
<?xml version="1.0" encoding="utf-8"?>
<!-- Generator: Adobe Illustrator 23.0.2, SVG Export Plug-In . SVG Version: 6.00 Build 0) -->
<svg version="1.1"
id="图层_1" xmlns:cc="http://creativecommons.org/ns#" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:inkscape="http://www.inkscape.org/namespaces/inkscape" xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:sodipodi="http://sodipodi.sourceforge.net/DTD/sodipodi-0.dtd" xmlns:svg="http://www.w3.org/2000/svg"
xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" x="0px" y="0px" viewBox="0 0 1000 580"
style="enable-background:new 0 0 1000 580;" xml:space="preserve">
<style type="text/css">
.st0{fill:#FFFFFF;}
.st1{fill:url(#polygon12_1_);}
.st2{fill:url(#polygon21_1_);}
.st3{opacity:0.27;fill:url(#circle42_1_);enable-background:new ;}
.st4{fill:url(#polygon58_1_);}
.st5{fill:#444444;stroke:#FFFFFF;stroke-width:0.834;stroke-miterlimit:10;}
.st6{fill:none;stroke:#FFFFFF;stroke-width:1.1033;stroke-miterlimit:10;}
.st7{fill:none;stroke:#353535;stroke-width:1.1033;stroke-miterlimit:10;}
.st8{fill:#FFFFFF;stroke:#444444;stroke-width:0.834;stroke-miterlimit:10;}
.st9{fill:#444444;stroke:#FFFFFF;stroke-width:0.8485;stroke-miterlimit:10;}
.st10{fill:none;stroke:#FFFFFF;stroke-width:1.1226;stroke-miterlimit:10;}
.st11{fill:none;stroke:#353535;stroke-width:1.1226;stroke-miterlimit:10;}
.st12{fill:#FFFFFF;stroke:#444444;stroke-width:0.8485;stroke-miterlimit:10;}
.st13{fill:#353535;}
.st14{fill:#444444;stroke:#FFFFFF;stroke-width:0.9321;stroke-miterlimit:10;}
.st15{fill:none;stroke:#FFFFFF;stroke-width:1.046;stroke-miterlimit:10;}
.st16{fill:none;stroke:#353535;stroke-width:1.046;stroke-miterlimit:10;}
.st17{fill:#FFFFFF;stroke:#444444;stroke-width:0.7906;stroke-miterlimit:10;}
.st18{opacity:0.59;fill:#E0E0E0;enable-background:new ;}
.st19{fill:#FFFFFF;stroke:#444444;stroke-width:2;stroke-miterlimit:10;}
.st20{fill:none;stroke:#444444;stroke-width:2;stroke-miterlimit:10;}
.st21{fill:none;stroke:#444444;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:10;}
.st22{enable-background:new ;}
.st23{fill:#4D4D4D;}
</style>
<rect id="BG_2_" x="-1" y="-9.5" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st0" width="1012.9" height="600.4">
</rect>
<linearGradient id="polygon12_1_" gradientUnits="userSpaceOnUse" x1="1014.7582" y1="90.2012" x2="1077.5918" y2="356.7023" gradientTransform="matrix(0.9556 0.295 -0.2974 0.9605 -400.3649 -336.724)">
<stop offset="4.835800e-02" style="stop-color:#9FA0A0"/>
<stop offset="0.5227" style="stop-color:#D7D8D8;stop-opacity:0.4381"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon12" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st1" points="608.7,371.9
511.9,134.7 490.9,134.1 430.7,377.1 ">
</polygon>
<linearGradient id="polygon21_1_" gradientUnits="userSpaceOnUse" x1="197.9478" y1="434.8972" x2="282.0578" y2="791.6389" gradientTransform="matrix(0.9983 -5.887031e-02 5.887031e-02 0.9983 28.0536 -430.7623)">
<stop offset="4.835800e-02" style="stop-color:#898989"/>
<stop offset="0.5874" style="stop-color:#D7D7D7;stop-opacity:0.3616"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon21" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st2" points="325.4,155.1
289.2,163.9 291.9,437 480.9,450.4 ">
</polygon>
<radialGradient id="circle42_1_" cx="836.3" cy="506.5986" r="65.7125" gradientTransform="matrix(1 0 0 1 12 -19.0997)" gradientUnits="userSpaceOnUse">
<stop offset="0" style="stop-color:#FFFFFF"/>
<stop offset="1" style="stop-color:#FFFFFF;stop-opacity:0"/>
</radialGradient>
<circle id="circle42" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st3" cx="848.3" cy="487.5" r="68.3">
</circle>
<linearGradient id="polygon58_1_" gradientUnits="userSpaceOnUse" x1="1863.538" y1="415.4688" x2="1929.7196" y2="696.1702" gradientTransform="matrix(0.8607 0.5092 -0.5092 0.8607 -635.7186 -1225.6498)">
<stop offset="4.835800e-02" style="stop-color:#898989"/>
<stop offset="0.5874" style="stop-color:#D7D7D7;stop-opacity:0.3616"/>
<stop offset="0.8926" style="stop-color:#FFFFFF;stop-opacity:0"/>
</linearGradient>
<polygon id="polygon58" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st4" points="547.4,383
763.1,428.2 713.7,163.4 683.8,156 ">
</polygon>
<g id="g94" transform="rotate(9.0573675,796.06564,263.99283)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path60" inkscape:connector-curvature="0" class="st5" d="M520,160.2l-75.8,10.4c-5.6,0.8-10.8-3.2-11.5-8.7l0,0
c-0.8-5.6,3.2-10.8,8.7-11.5l75.8-10.4c5.6-0.8,10.8,3.2,11.5,8.7l0,0C529.5,154.3,525.6,159.4,520,160.2L520,160.2z"/>
<ellipse id="circle62" transform="matrix(0.1574 -0.9875 0.9875 0.1574 220.2397 577.3303)" class="st6" cx="448.4" cy="159.6" rx="5.6" ry="5.6"/>
<ellipse id="circle64" transform="matrix(0.1574 -0.9875 0.9875 0.1574 241.8959 596.7076)" class="st6" cx="470.6" cy="156.6" rx="5.6" ry="5.6"/>
<ellipse id="circle66" transform="matrix(0.1574 -0.9875 0.9875 0.1574 304.0202 547.2599)" class="st7" cx="472.7" cy="95.5" rx="3.9" ry="3.9"/>
<ellipse id="circle68" transform="matrix(0.1574 -0.9875 0.9875 0.1574 263.9857 616.4264)" class="st6" cx="493.2" cy="153.5" rx="5.6" ry="5.6"/>
<ellipse id="circle70" transform="matrix(0.1574 -0.9875 0.9875 0.1574 285.6079 635.2634)" class="st6" cx="515.1" cy="150.3" rx="5.6" ry="5.6"/>
<path id="path72" inkscape:connector-curvature="0" class="st8" d="M510,139.5l-61.3,8.4l-0.4-2.8c-0.2-1.5,0.8-2.9,2.3-3.1
l55.7-7.7c1.5-0.2,2.9,0.8,3.1,2.3L510,139.5z"/>
<path id="path74" inkscape:connector-curvature="0" class="st8" d="M519.9,161.7L444.9,172l0.3,2.4c0.3,1.7,1.8,2.8,3.4,2.6
l69.1-9.5c1.7-0.3,2.8-1.8,2.6-3.4L519.9,161.7z"/>
<path id="path76" inkscape:connector-curvature="0" class="st7" d="M508.2,170.4l-50.5,6.9l0.4,3c0.2,1.3,1.5,2.3,2.8,2.2l45.5-6.3
c1.3-0.2,2.3-1.5,2.2-2.8L508.2,170.4z"/>
<path id="path78" inkscape:connector-curvature="0" class="st7" d="M499.5,133.7l-43.7,6l-1.8-13.2c-1-7.3,4.1-14,11.3-15l17.3-2.4
c7.3-1,14,4.1,15,11.3L499.5,133.7z"/>
<line id="line80" class="st7" x1="473.5" y1="100.6" x2="474.6" y2="109.1"/>
<line id="line82" class="st7" x1="471.6" y1="119.4" x2="464.5" y2="131.7"/>
<line id="line84" class="st7" x1="485.9" y1="117.9" x2="478.7" y2="130.2"/>
<path id="path86" inkscape:connector-curvature="0" class="st8" d="M470.7,182.8L467,193c-0.2,0.5,0.5,0.9,0.8,0.5l10.5-11.3
c0.3-0.3,0-0.9-0.4-0.8l-6.8,1C471,182.5,470.8,182.6,470.7,182.8z"/>
<path id="path88" inkscape:connector-curvature="0" class="st8" d="M460.7,184l-6.3,9.2c-0.3,0.5,0.3,1,0.8,0.7l12.8-10.1
c0.4-0.3,0.2-0.9-0.4-0.9l-6.6,0.9C460.9,183.9,460.8,184,460.7,184L460.7,184z"/>
<path id="path90" inkscape:connector-curvature="0" class="st8" d="M496.6,179l6.3,8.8c0.3,0.4-0.2,1-0.7,0.8l-13.2-8
c-0.4-0.3-0.3-0.8,0.2-0.9l6.8-0.8C496.3,178.8,496.5,178.9,496.6,179L496.6,179z"/>
<path id="path92" inkscape:connector-curvature="0" class="st8" d="M506.7,177.5l8.5,7.2c0.4,0.3,0,1.1-0.5,0.8l-15.1-6.3
c-0.5-0.2-0.4-0.9,0.1-0.9l6.6-0.9C506.4,177.4,506.5,177.4,506.7,177.5L506.7,177.5z"/>
</g>
<g id="g130" transform="translate(-131.09867,-443.26745)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path96" inkscape:connector-curvature="0" class="st9" d="M875,594.1l-76.7-13.2c-5.6-0.9-9.4-6.4-8.5-12l0,0
c0.9-5.6,6.4-9.4,12-8.5l76.7,13.2c5.6,0.9,9.4,6.4,8.5,12l0,0C885.9,591.3,880.6,595.1,875,594.1z"/>
<circle id="circle98" class="st10" cx="805.6" cy="571.5" r="5.7"/>
<circle id="circle100" class="st10" cx="828.2" cy="575.4" r="5.7"/>
<circle id="circle102" class="st11" cx="849" cy="516.8" r="4"/>
<circle id="circle104" class="st10" cx="851" cy="579.3" r="5.7"/>
<circle id="circle106" class="st10" cx="873.2" cy="583" r="5.7"/>
<path id="path108" inkscape:connector-curvature="0" class="st12" d="M871.6,570.9l-61.9-10.7l0.5-2.8c0.3-1.5,1.7-2.5,3.2-2.3
l56.4,9.8c1.5,0.3,2.5,1.7,2.3,3.2L871.6,570.9z"/>
<path id="path110" inkscape:connector-curvature="0" class="st12" d="M874.4,595.5l-76-13.2l-0.4,2.5c-0.3,1.7,0.8,3.3,2.5,3.6
l69.8,12c1.7,0.3,3.3-0.8,3.6-2.5L874.4,595.5z"/>
<path id="path112" inkscape:connector-curvature="0" class="st13" d="M860.3,600.3l-51.1-8.8l-0.5,3.1c-0.3,1.4,0.7,2.7,2.1,3l46,8
c1.4,0.3,2.7-0.7,3-2.1L860.3,600.3z"/>
<path id="path114" inkscape:connector-curvature="0" class="st11" d="M863.2,562.2l-44.2-7.6l2.3-13.3c1.3-7.3,8.3-12.3,15.6-11
l17.5,3.1c7.3,1.3,12.3,8.3,11,15.6L863.2,562.2z"/>
<line id="line116" class="st11" x1="848.1" y1="522.1" x2="846.6" y2="530.6"/>
<line id="line118" class="st11" x1="840.5" y1="539.7" x2="829.8" y2="549.5"/>
<line id="line120" class="st11" x1="854.8" y1="542.6" x2="844.2" y2="552.3"/>
<path id="path122" inkscape:connector-curvature="0" class="st12" d="M820.1,600.8l-6.8,8.7c-0.3,0.4,0.2,1,0.7,0.8l13.7-7.6
c0.4-0.3,0.3-0.8-0.2-0.9l-7-1.1C820.5,600.6,820.3,600.7,820.1,600.8L820.1,600.8z"/>
<path id="path124" inkscape:connector-curvature="0" class="st12" d="M810,599l-8.9,7c-0.4,0.3-0.1,1.1,0.5,0.8l15.5-5.9
c0.5-0.2,0.4-0.8-0.1-1l-6.6-1.2C810.3,598.8,810.1,598.9,810,599L810,599z"/>
<path id="path126" inkscape:connector-curvature="0" class="st12" d="M846.4,605.1l3.5,10.5c0.2,0.5-0.5,0.9-0.8,0.5l-10.4-11.8
c-0.3-0.3,0-0.9,0.5-0.8l6.9,1.3C846.3,604.9,846.4,605,846.4,605.1L846.4,605.1z"/>
<path id="path128" inkscape:connector-curvature="0" class="st12" d="M856.6,606.8l6,9.6c0.3,0.5-0.3,1-0.8,0.7l-12.7-10.7
c-0.4-0.3-0.1-1,0.4-0.8l6.6,1.2C856.4,606.6,856.5,606.7,856.6,606.8L856.6,606.8z"/>
</g>
<g id="g166" transform="translate(6.564267,-535.67492)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path id="path132" inkscape:connector-curvature="0" class="st14" d="M331.5,670.9l-70.2,18.3c-5.1,1.3-10.4-1.8-11.7-6.9l0,0
c-1.3-5.1,1.8-10.4,6.9-11.7l70.2-18.3c5.1-1.3,10.4,1.8,11.7,6.9l0,0C339.7,664.2,336.7,669.6,331.5,670.9z"/>
<circle id="circle134" class="st15" cx="264.1" cy="678.3" r="5.3"/>
<circle id="circle136" class="st15" cx="284.7" cy="673" r="5.3"/>
<circle id="circle138" class="st16" cx="279.7" cy="615.2" r="3.7"/>
<circle id="circle140" class="st15" cx="305.5" cy="667.5" r="5.3"/>
<circle id="circle142" class="st15" cx="325.8" cy="662.1" r="5.3"/>
<path id="path144" inkscape:connector-curvature="0" class="st17" d="M319.7,652.5L263,667.3l-0.7-2.5c-0.4-1.4,0.5-2.8,1.9-3.2
l51.6-13.4c1.4-0.4,2.8,0.5,3.2,1.9L319.7,652.5z"/>
<path id="path146" inkscape:connector-curvature="0" class="st17" d="M331.5,672.3L262,690.4l0.6,2.2c0.4,1.6,2,2.5,3.5,2.1
l63.9-16.7c1.6-0.4,2.5-2,2.1-3.5L331.5,672.3z"/>
<path id="path148" inkscape:connector-curvature="0" class="st13" d="M321.5,681.8L274.8,694l0.7,2.8c0.4,1.3,1.7,2.1,2.9,1.7
l42.1-11c1.3-0.4,2.1-1.7,1.7-2.9L321.5,681.8z"/>
<path id="path150" inkscape:connector-curvature="0" class="st16" d="M309.3,648.2l-40.5,10.5l-3.2-12.2c-1.8-6.7,2.3-13.6,9-15.4
l16-4.2c6.7-1.8,13.6,2.3,15.4,9L309.3,648.2z"/>
<line id="line152" class="st16" x1="281" y1="620.1" x2="283.1" y2="627.9"/>
<line id="line154" class="st16" x1="281.4" y1="637.9" x2="276.1" y2="650.3"/>
<line id="line156" class="st16" x1="294.6" y1="634.9" x2="289.3" y2="647.3"/>
<path id="path158" inkscape:connector-curvature="0" class="st17" d="M287.6,697.6l-2.3,10.1c-0.1,0.5,0.6,0.8,0.8,0.4l8.7-11.7
c0.3-0.4-0.1-0.8-0.5-0.7l-6.3,1.8C287.8,697.3,287.6,697.4,287.6,697.6L287.6,697.6z"/>
<path id="path160" inkscape:connector-curvature="0" class="st17" d="M278.3,699.9l-4.8,9.3c-0.3,0.5,0.4,0.9,0.7,0.6l10.9-10.9
c0.4-0.4,0-0.9-0.5-0.8l-6.1,1.6C278.5,699.8,278.4,699.8,278.3,699.9L278.3,699.9z"/>
<path id="path162" inkscape:connector-curvature="0" class="st17" d="M311.6,691.2l7,7.6c0.4,0.4-0.1,0.9-0.6,0.7l-13.3-6.1
c-0.4-0.2-0.4-0.7,0.1-0.9l6.3-1.6C311.3,691,311.5,691.1,311.6,691.2z"/>
<path id="path164" inkscape:connector-curvature="0" class="st17" d="M320.8,688.7l8.8,5.8c0.5,0.3,0.1,1-0.4,0.8l-14.9-4.2
c-0.5-0.1-0.5-0.8,0-0.9l6.1-1.6C320.6,688.6,320.7,688.6,320.8,688.7z"/>
</g>
<path id="path168" inkscape:connector-curvature="0" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st18" d="
M241.1,524.1h-1c0.4-1.2,0.6-2.4,0.6-3.7c0-7-5.7-12.7-12.7-12.7h-3.3c-2.7,0-5.2,0.9-7.2,2.3c-1.2-0.8-2.7-1.3-4.3-1.3
c-0.7,0-1.4,0.1-2.1,0.3c-0.1-3.9-3.3-7.1-7.2-7.1h-0.3c-2.2,0-4.1,1-5.4,2.5l0,0c2.7-3.8,4.4-8.5,4.4-13.6
c0-13-10.6-23.5-23.5-23.5c-13,0-23.5,10.6-23.5,23.5c0,0.7,0,1.3,0.1,1.9c-2.9,0.3-5.7,1.3-8,2.9c-3.2-4.3-8.2-7.1-13.9-7.1l0,0
c-9.5,0-17.3,7.8-17.3,17.3c0,0.1,0,0.3,0,0.4c-1.8-0.9-3.7-1.5-5.9-1.5c-6.9,0-12.6,5.7-12.6,12.6c0,0.7,0.1,1.5,0.2,2.2H92
c-3,0-5.5,1.9-6.5,4.5h-9.9c-2.5,0-4.5,2-4.5,4.5s2,4.5,4.5,4.5h149.2h3.3h13.1c2.5,0,4.5-2,4.5-4.5
C245.6,526.1,243.6,524.1,241.1,524.1L241.1,524.1z"/>
<path id="path170" inkscape:connector-curvature="0" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st18" d="
M898.4,516.3h-0.9c0.3-1,0.5-2.1,0.5-3.3c0-6.2-5.1-11.3-11.3-11.3h-2.9c-2.4,0-4.6,0.8-6.4,2c-1.1-0.7-2.4-1.1-3.8-1.1
c-0.6,0-1.2,0.1-1.8,0.3c-0.1-3.5-2.9-6.3-6.4-6.3H865c-1.9,0-3.6,0.9-4.8,2.2l0,0c2.4-3.4,3.9-7.6,3.9-12
c0-11.5-9.4-20.9-20.9-20.9s-20.9,9.4-20.9,20.9c0,0.6,0,1.2,0.1,1.7c-2.6,0.2-5,1.2-7.1,2.6c-2.8-3.8-7.3-6.3-12.3-6.3l0,0
c-8.5,0-15.4,6.9-15.4,15.4c0,0.1,0,0.2,0,0.4c-1.6-0.8-3.3-1.4-5.2-1.4c-6.2,0-11.2,5-11.2,11.2c0,0.7,0.1,1.3,0.2,1.9h-5.5
c-2.6,0-4.9,1.7-5.8,4h-8.8c-2.2,0-4,1.8-4,4s1.8,4,4,4h132.5h2.9h11.6c2.2,0,4-1.8,4-4C902.4,518.1,900.6,516.3,898.4,516.3
L898.4,516.3z"/>
<g id="g184" transform="translate(10.641067,-115.56078)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<g id="g178">
<path id="path172" inkscape:connector-curvature="0" class="st19" d="M149.5,586c2.4,4.6,6.2,8.3,10.8,10.8c3.6,1.9,7.6,3,11.9,3
c14.2,0,25.7-11.5,25.7-25.7c0-4.3-1.1-8.4-3-11.9c-2.4-4.6-6.2-8.3-10.8-10.8c-3.6-1.9-7.6-3-11.9-3c-14.2,0-25.7,11.5-25.7,25.7
C146.6,578.4,147.7,582.4,149.5,586z"/>
<path id="path174" inkscape:connector-curvature="0" class="st18" d="M194.1,562.5c-2.3-4.4-6-8.1-10.4-10.4
c-3.4-1.8-7.4-2.9-11.5-2.9c-9.5,0-17.7,5.3-21.8,13.1c4-2.8,8.9-4.4,14.1-4.4c4.2,0,8.1,1,11.5,2.9c4.4,2.3,8.1,6,10.4,10.4
c1.8,3.4,2.9,7.4,2.9,11.5c0,4.2-1.1,8.2-2.9,11.7c6.4-4.5,10.6-11.9,10.6-20.3C197,569.8,196,565.9,194.1,562.5L194.1,562.5z"/>
<path id="path176" inkscape:connector-curvature="0" class="st20" d="M149.5,586c-7.7,10-11.6,17.8-9.3,20.1s10.1-1.6,20.1-9.3
c5.6-4.4,12-9.9,18.3-16.3c6.4-6.4,11.9-12.7,16.3-18.3c7.7-10,11.6-17.8,9.3-20.1s-10.1,1.6-20.1,9.3"/>
</g>
<path id="path180" inkscape:connector-curvature="0" class="st21" d="M154,566.3c0.5-1.2,1.1-2.3,1.8-3.4c0.7-1.1,1.5-2,2.4-2.9
s1.9-1.7,2.9-2.4c1.1-0.7,2.2-1.3,3.4-1.8s2.4-0.9,3.7-1.2s2.6-0.4,4-0.4"/>
<path id="path182" inkscape:connector-curvature="0" class="st21" d="M152.4,574.1c0-1.4,0.1-2.7,0.4-4"/>
</g>
<g id="g192" transform="translate(-0.2304,235.22748)" inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<polygon id="polygon186" class="st19" points="843.4,216.1 850,204.1 856.6,216.1 868.7,222.7 856.6,229.3 850,241.4 843.4,229.3
831.4,222.7 "/>
<polygon id="polygon188" class="st19" points="868.4,248.1 873.4,239.1 878.3,248.1 887.4,253.1 878.3,258 873.4,267.1 868.4,258
859.4,253.1 "/>
<polygon id="polygon190" class="st19" points="884.1,207.8 887.4,201.7 890.7,207.8 896.7,211.1 890.7,214.4 887.4,220.4
884.1,214.4 878,211.1 "/>
</g>
<g inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476" class="st22">
<path class="st23" d="M332.8,346.5l65.5-89.9h24.1v89.6h17.5v20.8h-17.5v28.8h-27.9v-28.8h-61.7V346.5z M358.9,346.1h35.6v-34.4
c0-5.9,0.2-11.6,0.8-17h-0.9c-4.9,7.8-8,12.7-9.4,14.6L358.9,346.1z"/>
<path class="st23" d="M450.4,326.2c0-11.1,1.1-21,3.2-29.6c2.1-8.6,4.9-15.5,8.2-20.7c3.3-5.2,7.3-9.4,11.9-12.8
c4.6-3.3,9.1-5.6,13.5-6.8c4.4-1.2,9-1.8,13.7-1.8c16.3,0,28.8,6.4,37.5,19.1c8.7,12.7,13.1,30.3,13.1,52.6
c0,22.1-4.4,39.6-13.1,52.4c-8.7,12.8-21.2,19.2-37.4,19.2c-4.5,0-8.9-0.6-13.2-1.7c-4.3-1.1-8.7-3.3-13.4-6.6
c-4.6-3.2-8.7-7.4-12.1-12.5c-3.5-5.1-6.3-12-8.6-20.8C451.5,347.6,450.4,337.6,450.4,326.2z M479.4,326.2
c0,33.8,7.2,50.7,21.6,50.7c14.2,0,21.3-16.9,21.3-50.7c0-33.8-7.2-50.7-21.5-50.7C486.5,275.5,479.4,292.4,479.4,326.2z"/>
<path class="st23" d="M562.4,346.5l65.5-89.9h24.1v89.6h17.5v20.8h-17.5v28.8h-27.8v-28.8h-61.7V346.5z M588.5,346.1h35.6v-34.4
c0-5.9,0.2-11.6,0.8-17h-0.9c-4.9,7.8-8,12.7-9.4,14.6L588.5,346.1z"/>
</g>
<g inkscape:export-xdpi="96.009476" inkscape:export-ydpi="96.009476">
<path class="st23" d="M435.8,197.2c0-6.4,1.6-11.6,4.9-15.5c3.2-3.9,7.5-5.9,12.8-5.9c5.3,0,9.6,2,12.8,5.9
c3.2,3.9,4.8,9.1,4.8,15.5c0,6.4-1.6,11.6-4.8,15.4c-3.2,3.9-7.5,5.8-12.8,5.8c-5.3,0-9.6-1.9-12.8-5.8
C437.5,208.8,435.8,203.6,435.8,197.2z M443.6,197.2c0,4.6,0.9,8.2,2.6,10.8c1.7,2.7,4.2,4,7.3,4c3.1,0,5.5-1.3,7.2-4
c1.7-2.7,2.6-6.3,2.6-10.9c0-4.6-0.9-8.2-2.6-10.8c-1.7-2.7-4.1-4-7.2-4c-3.1,0-5.5,1.3-7.2,4C444.5,189,443.6,192.7,443.6,197.2z"
/>
<path class="st23" d="M474.8,202.7c0-4.4,1.2-8.2,3.5-11.2c2.3-3.1,5.8-4.6,10.4-4.6c3.1,0,5.7,0.8,7.9,2.3
c2.2,1.6,3.7,3.5,4.6,5.8c0.9,2.3,1.4,4.9,1.4,7.7c0,1.4-0.1,2.7-0.4,4c-0.2,1.3-0.7,2.7-1.3,4.2c-0.7,1.5-1.5,2.7-2.5,3.8
c-1,1.1-2.3,2-4,2.7c-1.7,0.7-3.6,1.1-5.7,1.1c-2.1,0-4-0.3-5.6-1c-1.7-0.7-3-1.5-4-2.6c-1-1.1-1.8-2.3-2.5-3.7
c-0.7-1.4-1.1-2.8-1.4-4.2C475,205.6,474.8,204.1,474.8,202.7z M482.3,202.7c0,3.4,0.6,5.8,1.9,7.3c1.3,1.5,2.8,2.3,4.5,2.3
c1.7,0,3.1-0.8,4.5-2.3c1.3-1.5,2-4,2-7.3c0-3.4-0.7-5.8-2-7.4c-1.3-1.5-2.8-2.3-4.5-2.3c-1.7,0-3.2,0.8-4.5,2.3
C482.9,196.8,482.3,199.3,482.3,202.7z"/>
<path class="st23" d="M507.9,229.5v-41.8h6.9v3.3c1.9-2.8,4.3-4.3,7.4-4.3c3.6,0,6.7,1.4,9.1,4.3c2.4,2.8,3.6,6.7,3.6,11.5
c0,2.8-0.4,5.2-1.2,7.3c-0.8,2.1-1.8,3.8-3.1,5c-1.3,1.2-2.6,2.1-4.1,2.7c-1.4,0.6-2.9,0.9-4.4,0.9c-1.8,0-3.3-0.4-4.5-1.2
c-1.2-0.8-2.1-1.7-2.7-2.7v14.9H507.9z M515,202.6c0,3,0.5,5.3,1.6,7.1c1.1,1.7,2.6,2.6,4.6,2.6c1.9,0,3.4-0.8,4.6-2.5
c1.1-1.7,1.7-4.1,1.7-7.2c0-3-0.6-5.3-1.7-7c-1.1-1.7-2.7-2.6-4.6-2.6c-2,0-3.5,0.9-4.6,2.7C515.6,197.4,515,199.8,515,202.6z"/>
<path class="st23" d="M537.8,211.3l5.1-3c2,3,4.6,4.5,7.9,4.5c1.5,0,2.6-0.3,3.4-0.9c0.8-0.6,1.2-1.4,1.2-2.3c0-0.3,0-0.6-0.1-0.8
c-0.1-0.3-0.2-0.5-0.4-0.7c-0.2-0.2-0.4-0.4-0.6-0.6c-0.2-0.2-0.4-0.4-0.8-0.6c-0.4-0.2-0.7-0.4-0.9-0.5c-0.2-0.1-0.6-0.3-1.1-0.5
c-0.5-0.2-0.9-0.3-1.2-0.4c-0.3-0.1-0.7-0.2-1.3-0.4c-0.6-0.2-1-0.3-1.3-0.4c-2.6-0.8-4.6-1.9-6.2-3.2c-1.6-1.3-2.3-3.2-2.3-5.7
c0-2.7,1.1-4.8,3.4-6.4c2.3-1.6,5-2.4,8.3-2.4c2.5,0,4.8,0.6,7,1.9c2.2,1.3,3.8,3,4.9,5l-4.8,2.9c-2.1-2.7-4.5-4-7.1-4
c-1.4,0-2.5,0.3-3.2,0.8c-0.8,0.6-1.1,1.3-1.1,2.2c0,0.3,0,0.6,0.1,0.9c0.1,0.3,0.2,0.5,0.4,0.8c0.2,0.2,0.4,0.4,0.6,0.6
c0.2,0.2,0.5,0.4,0.8,0.6c0.4,0.2,0.7,0.4,0.9,0.5c0.3,0.1,0.6,0.3,1.1,0.4c0.5,0.2,0.8,0.3,1.1,0.4c0.3,0.1,0.7,0.2,1.2,0.4
c0.5,0.2,0.9,0.3,1.2,0.4c5.9,2,8.9,4.9,8.9,8.9c0,2.5-1,4.6-3.1,6.4c-2.1,1.7-5,2.6-8.8,2.6c-2.9,0-5.6-0.7-7.9-2
S539.1,213.4,537.8,211.3z"/>
<path class="st23" d="M568.2,217.7v-8.3h9.3v8.3H568.2z M576.4,205.9h-7l-0.5-29.4h8L576.4,205.9z"/>
</g>
<g>
<path class="st23" d="M333.4,464.8v-27.1h12c3.2,0,5.7,0.8,7.5,2.3c1.8,1.5,2.7,3.6,2.7,6.1c0,2.6-0.9,4.6-2.8,6.1
s-4.3,2.2-7.4,2.2H339v10.3H333.4z M339,450.2h5.7c1.6,0,2.9-0.4,3.8-1.1c0.9-0.7,1.4-1.7,1.4-3c0-1.3-0.4-2.3-1.3-3
c-0.9-0.7-2.2-1.1-3.8-1.1H339V450.2z"/>
<path class="st23" d="M354.1,464.8l10.8-27.1h6l10.8,27.1h-5.9l-2.3-6.4h-11.2l-2.3,6.4H354.1z M363.7,454.4h8.3l-2.2-6.4
c-0.5-1.5-1.1-3.3-1.8-5.5h-0.2c-0.2,0.7-0.5,1.6-0.9,2.8c-0.4,1.2-0.7,2.1-0.9,2.6L363.7,454.4z"/>
<path class="st23" d="M383,451.2c0-4.1,1.3-7.5,3.8-10.2c2.5-2.6,5.9-4,10-4c1.3,0,2.5,0.2,3.7,0.5c1.2,0.3,2.1,0.7,2.9,1.2
c0.8,0.5,1.5,1,2.2,1.7c0.7,0.6,1.2,1.2,1.5,1.7c0.4,0.5,0.7,1,0.9,1.5l-4.8,1.4c-0.4-0.5-0.8-1-1.1-1.3c-0.3-0.3-0.7-0.7-1.2-1.2
s-1.1-0.8-1.8-1s-1.4-0.3-2.3-0.3c-2.5,0-4.5,0.9-5.9,2.7c-1.4,1.8-2.1,4.2-2.1,7.2c0,3,0.7,5.4,2.2,7.2c1.5,1.8,3.3,2.7,5.6,2.7
c2,0,3.5-0.5,4.6-1.4c1.1-1,1.8-2.2,2-3.8c0.1-1.3,0.2-2,0.2-2.1h-7.1v-4.3h12.5v15.3H405l-0.5-2.4c-1.7,2-4.4,3-8,3
c-3.8,0-7-1.3-9.6-3.8C384.3,459,383,455.5,383,451.2z"/>
<path class="st23" d="M413.9,464.8v-27.1H434v4.3h-14.6v6.7h13.1v4.2h-13.1v7.6h14.9v4.3H413.9z"/>
<path class="st23" d="M448.1,464.8v-27.1h5.8l8.5,14.1l3.3,5.5h0.2c-0.4-2.6-0.6-5.2-0.6-7.9v-11.7h5.6v27.1h-5.8l-8.5-13.8
l-3.3-5.7h-0.2c0.4,2.5,0.6,5.1,0.6,7.9v11.6H448.1z"/>
<path class="st23" d="M475.4,451.3c0-4.3,1.2-7.7,3.6-10.3c2.4-2.6,5.6-3.9,9.5-3.9c4,0,7.1,1.3,9.5,3.9c2.4,2.6,3.6,6,3.6,10.3
c0,4.3-1.2,7.7-3.6,10.3c-2.4,2.6-5.6,3.9-9.5,3.9s-7.1-1.3-9.5-3.9C476.6,458.9,475.4,455.5,475.4,451.3z M481.2,451.3
c0,3,0.6,5.4,1.9,7.2c1.3,1.8,3.1,2.7,5.4,2.7c2.3,0,4.1-0.9,5.4-2.6c1.3-1.8,1.9-4.2,1.9-7.2c0-3-0.6-5.4-1.9-7.2
c-1.3-1.8-3.1-2.7-5.4-2.7c-2.3,0-4.1,0.9-5.4,2.7C481.9,445.8,481.2,448.2,481.2,451.3z"/>
<path class="st23" d="M502.7,442v-4.3h22.7v4.3h-8.6v22.8h-5.6V442H502.7z"/>
<path class="st23" d="M538.3,464.8v-27.1h19.6v4.3h-13.9v7.1h12.5v4.2h-12.5v11.4H538.3z"/>
<path class="st23" d="M559.9,451.3c0-4.3,1.2-7.7,3.6-10.3c2.4-2.6,5.6-3.9,9.5-3.9c4,0,7.1,1.3,9.5,3.9c2.4,2.6,3.6,6,3.6,10.3
c0,4.3-1.2,7.7-3.6,10.3c-2.4,2.6-5.6,3.9-9.5,3.9c-4,0-7.1-1.3-9.5-3.9C561.1,458.9,559.9,455.5,559.9,451.3z M565.7,451.3
c0,3,0.6,5.4,1.9,7.2c1.3,1.8,3.1,2.7,5.4,2.7c2.3,0,4.1-0.9,5.4-2.6c1.3-1.8,1.9-4.2,1.9-7.2c0-3-0.6-5.4-1.9-7.2
c-1.3-1.8-3.1-2.7-5.4-2.7c-2.3,0-4.1,0.9-5.4,2.7C566.3,445.8,565.7,448.2,565.7,451.3z"/>
<path class="st23" d="M590.5,454.6v-16.9h5.6v16.9c0,4.4,1.9,6.5,5.7,6.5c3.8,0,5.7-2.2,5.7-6.5v-16.9h5.6v16.9
c0,3.5-0.9,6.2-2.8,8c-1.9,1.9-4.7,2.8-8.5,2.8c-3.6,0-6.4-0.9-8.4-2.7S590.5,458.2,590.5,454.6z"/>
<path class="st23" d="M619.3,464.8v-27.1h5.8l8.5,14.1l3.3,5.5h0.2c-0.4-2.6-0.6-5.2-0.6-7.9v-11.7h5.6v27.1h-5.8l-8.5-13.8
l-3.3-5.7h-0.2c0.4,2.5,0.6,5.1,0.6,7.9v11.6H619.3z"/>
<path class="st23" d="M648.3,464.8v-27.1h9.2c4.6,0,8.2,1.2,10.7,3.5s3.7,5.7,3.7,10c0,1.4-0.1,2.8-0.4,4c-0.3,1.3-0.8,2.5-1.4,3.7
c-0.7,1.2-1.6,2.2-2.6,3c-1.1,0.8-2.4,1.5-4.1,2c-1.7,0.5-3.6,0.8-5.8,0.8H648.3z M653.9,460.5h3c3,0,5.3-0.7,6.8-2.2
c1.5-1.5,2.3-3.8,2.3-7.1c0-3.3-0.8-5.7-2.4-7.2s-3.8-2.1-6.7-2.1h-3.1V460.5z"/>
</g>
</svg>

After

Width:  |  Height:  |  Size: 22 KiB

View File

Before

Width:  |  Height:  |  Size: 38 KiB

After

Width:  |  Height:  |  Size: 38 KiB

View File

Before

Width:  |  Height:  |  Size: 26 KiB

After

Width:  |  Height:  |  Size: 26 KiB

View File

Before

Width:  |  Height:  |  Size: 36 KiB

After

Width:  |  Height:  |  Size: 36 KiB

View File

Before

Width:  |  Height:  |  Size: 31 KiB

After

Width:  |  Height:  |  Size: 31 KiB

View File

Before

Width:  |  Height:  |  Size: 1.8 KiB

After

Width:  |  Height:  |  Size: 1.8 KiB

View File

Before

Width:  |  Height:  |  Size: 2.3 KiB

After

Width:  |  Height:  |  Size: 2.3 KiB

View File

Before

Width:  |  Height:  |  Size: 5.9 KiB

After

Width:  |  Height:  |  Size: 5.9 KiB

BIN
docs/_static/QR_Dilated_Convolution.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.1 KiB

BIN
docs/_static/QR_MFCC.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.5 KiB

BIN
docs/_static/QR_multinet_g2p.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 6.7 KiB

View File

Before

Width:  |  Height:  |  Size: 17 KiB

After

Width:  |  Height:  |  Size: 17 KiB

View File

Before

Width:  |  Height:  |  Size: 25 KiB

After

Width:  |  Height:  |  Size: 25 KiB

View File

Before

Width:  |  Height:  |  Size: 43 KiB

After

Width:  |  Height:  |  Size: 43 KiB

BIN
docs/_static/icon-green-check.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.0 KiB

BIN
docs/_static/icon-orange-check.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.9 KiB

BIN
docs/_static/icon-red-cross.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 2.1 KiB

32
docs/_static/js/at_versions.js vendored Normal file
View File

@ -0,0 +1,32 @@
// Version/target metadata for the documentation build.
// NOTE(review): presumably consumed by the docs site's version/target
// selector widget — confirm against the page templates that load this file.
//   DEFAULTS    - fallback fields for version entries that omit them
//   VERSIONS    - every published documentation build, newest first
//   IDF_TARGETS - display label -> target slug mapping for the target picker
var DOCUMENTATION_VERSIONS = {
  DEFAULTS: {
    has_targets: false,
    supported_targets: ["esp32s3"]
  },
  VERSIONS: [
    // latest
    { name: "latest", has_targets: true, supported_targets: ["esp32", "esp32c2", "esp32c3"] },
    // v2.4.0.0
    { name: "release-v2.4.0.0", has_targets: true, supported_targets: ["esp32", "esp32c3"] },
    // v2.3.0.0
    { name: "release-v2.3.0.0_esp32c3", has_targets: false, supported_targets: ["esp32c3"] },
    // v2.2.0.0
    { name: "release-v2.2.0.0_esp32c3", has_targets: false, supported_targets: ["esp32c3"] },
    { name: "release-v2.2.0.0_esp32", has_targets: false, supported_targets: ["esp32", "esp32s2"] },
    { name: "release-v2.2.0.0_esp8266", has_targets: false, supported_targets: ["esp8266"] },
    // v2.1.0.0
    { name: "release-v2.1.0.0_esp32", has_targets: false, supported_targets: ["esp32"] },
    { name: "release-v2.1.0.0_esp8266", has_targets: false, supported_targets: ["esp8266"] },
    { name: "release-v2.1.0.0_esp32s2", has_targets: false, supported_targets: ["esp32s2"] }
  ],
  IDF_TARGETS: [
    { text: "ESP32-C2 (ESP8684)", value: "esp32c2" },
    { text: "ESP32-C3", value: "esp32c3" },
    { text: "ESP32", value: "esp32" },
    { text: "ESP8266", value: "esp8266" },
    { text: "ESP32-S2", value: "esp32s2" }
  ]
};

View File

Before

Width:  |  Height:  |  Size: 57 KiB

After

Width:  |  Height:  |  Size: 57 KiB

View File

Before

Width:  |  Height:  |  Size: 29 KiB

After

Width:  |  Height:  |  Size: 29 KiB

View File

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 21 KiB

View File

Before

Width:  |  Height:  |  Size: 16 KiB

After

Width:  |  Height:  |  Size: 16 KiB

View File

Before

Width:  |  Height:  |  Size: 2.1 KiB

After

Width:  |  Height:  |  Size: 2.1 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 15 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 14 KiB

BIN
docs/_static/test_response_time.png vendored Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 36 KiB

View File

Before

Width:  |  Height:  |  Size: 103 KiB

After

Width:  |  Height:  |  Size: 103 KiB

View File

Before

Width:  |  Height:  |  Size: 22 KiB

After

Width:  |  Height:  |  Size: 22 KiB

View File

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

Before

Width:  |  Height:  |  Size: 21 KiB

After

Width:  |  Height:  |  Size: 21 KiB

View File

Before

Width:  |  Height:  |  Size: 20 KiB

After

Width:  |  Height:  |  Size: 20 KiB

View File

@ -1,219 +0,0 @@
# Acoustic Algorithm Introduction
Acoustic algorithms provided in esp-sr include voice activity detection (VAD), automatic gain control (AGC), acoustic echo cancellation (AEC), noise suppression (NS), and mic-array speech enhancement (MASE). VAD, AGC, AEC, and NS are supported with either single-mic or multi-mic development boards; MASE is supported with multi-mic boards only.
## VAD
### Overview
VAD takes an audio stream as input, and outputs the prediction that a frame of the stream contains audio or not.
### API Reference
#### Header
- esp_vad.h
#### Function
- `vad_handle_t vad_create(vad_mode_t vad_mode)`
**Definition**
Initialization of VAD handle.
**Parameter**
- vad_mode: operating mode of VAD, VAD_MODE_0 to VAD_MODE_4, larger value indicates more aggressive VAD.
**Return**
Handle to VAD.
- `vad_state_t vad_process(vad_handle_t inst, int16_t *data, int sample_rate_hz, int one_frame_ms);`
**Definition**
Processing of VAD for one frame.
**Parameter**
- inst: VAD handle.
- data: buffer to save both input and output audio stream.
- sample_rate_hz: The Sampling frequency (Hz) can be 32000, 16000, 8000, default: 16000.
- one_frame_ms: The length of the audio processing can be 10ms, 20ms, 30ms, default: 30.
**Return**
- VAD_SILENCE if no voice
- VAD_SPEECH if voice is detected
- `void vad_destroy(vad_handle_t inst)`
**Definition**
Destruction of a VAD handle.
**Parameter**
- inst: the VAD handle to be destroyed.
## AGC
### Overview
AGC keeps the volume of audio signal at a stable level to avoid the situation that the signal is so loud that gets clipped or too quiet to trigger the speech recognizer.
### API Reference
- `void *esp_agc_open(int agc_mode, int sample_rate)`
**Definition**
Initialization of AGC handle.
**Parameter**
- agc_mode: operating mode of AGC, 3 to enable AGC and 0 to disable it.
- sample_rate: sampling rate of audio signal.
**Return**
- AGC handle.
- `int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate)`
**Definition**
Processing of AGC for one frame.
**Parameter**
- agc_handle: AGC handle.
- in_pcm: input audio stream.
- out_pcm: output audio stream.
- frame_size: signal frame length in ms.
- sample_rate: signal sampling rate in Hz.
**Return**
Return 0 if AGC processing succeeds, -1 if fails; -2 and -3 indicate invalid input of sample_rate and frame_size, respectively.
- `void esp_agc_clse(void *agc_handle)`
**Definition**
Destruction of an AGC handle.
**Parameter**
- agc_handle: the AGC handle to be destroyed.
## AEC
### Overview
AEC suppresses echo of the sound played by the speaker of the board.
### API Reference
- `aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length)`
**Definition**
Initialization of AEC handle.
**Parameter**
- sample_rate: audio signal sampling rate.
- frame_length: audio frame length in ms.
- filter_length: the length of adaptive filter in AEC.
**Return**
Handle to AEC.
- `aec_create_t aec_create_multimic(int sample_rate, int frame_length, int filter_length, int nch)`
**Definition**
Initialization of AEC handle.
**Parameter**
- sample_rate: audio signal sampling rate.
- frame_length: audio frame length in ms.
- filter_length: the length of adaptive filter in AEC.
- nch: number of channels of the signal to be processed.
**Return**
Handle to AEC.
- `void aec_process(aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata)`
**Definition**
Processing of AEC for one frame.
**Parameter**
- inst: AEC handle.
- indata: input audio stream, which could be single- or multi-channel, depending on the channel number defined on initialization.
- refdata: reference signal to be cancelled from the input.
- outdata: output audio stream, the number of channels is the same as indata.
- `void aec_destroy(aec_handle_t inst)`
**Definition**
Destruction of an AEC handle.
**Parameter**
- inst: the AEC handle to be destroyed.
## NS
### Overview
Single-channel speech enhancement. If multiple mics are available with the board, MASE is recommended for noise suppression.
### API Reference
- `ns_handle_t ns_pro_create(int frame_length, int mode)`
**Definition**
Creates an instance of the more powerful noise suppression algorithm.
**Parameter**
- frame_length_ms: audio frame length in ms.
- mode: 0: Mild, 1: Medium, 2: Aggressive
**Return**
Handle to NS.
- `void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata)`
**Definition**
Processing of NS for one frame.
**Parameter**
- inst: NS handle.
- indata: input audio stream.
- outdata: output audio stream.
- `void ns_destroy(ns_handle_t inst)`
**Definition**
Destruction of a NS handle.
**Parameter**
- inst: the NS handle to be destroyed.

View File

@ -1,55 +0,0 @@
# Espressif Microphone Design Guidelines
> This document provides microphone design guidelines and suggestions for the ESP32-S3 series of audio development boards.
### Electrical Performance
1. Type: omnidirectional MEMS microphone
2. Sensitivity
- Under 1 Pa sound pressure, it should be no less than -38 dBV for analog microphones, and -26 dB for digital microphones.
- The tolerance should be controlled within ±2 dB, and within ±1 dB for microphone arrays.
3. Signal-to-noise ratio (SNR)
- No less than 62 dB. Higher than 64 dB is recommended.
- Frequency response fluctuates within ±3 dB from 50 Hz to 16 kHz.
- PSRR should be larger than 55 dB for MEMS microphones.
---
### Structure Design
1. The aperture or width of the microphone hole is recommended to be greater than 1 mm, the pickup pipe should be as short as possible, and the cavity should be as small as possible to ensure that the resonance frequency of the microphone and structural components is above 9 kHz.
2. The depth and diameter of the pickup hole are less than 4:1, and the thickness of the shell is recommended to be 1 mm. If the shell is too thick, the opening area must be increased.
3. The microphone hole must be protected by an anti-dust mesh.
4. Silicone sleeve or foam must be added between the microphone and the device shell for sealing and shockproofing, and an interference fit design is required to ensure the tightness of the microphone.
5. The microphone hole cannot be blocked. The bottom microphone hole needs to be increased in structure to prevent it from being blocked by the desktop.
6. The microphone should be placed far away from the speaker and other objects that can produce noise or vibration, and be isolated and buffered by rubber pads from the speaker sound cavity.
---
### Microphone Array Design
1. Type: omnidirectional MEMS microphone. Use the same models from the same manufacturer for the array. Mixing different microphones is not recommended.
2. The sensitivity difference among microphones in the array is within 3 dB.
3. The phase difference among the microphones in the array is controlled within 10°.
4. It is recommended to keep the structural design of each microphone in the array the same to ensure consistency.
5. Two-microphone solution: the distance between the microphones should be 4 ~ 6.5 cm, the axis connecting them should be parallel to the horizontal line, and the center of the two microphones should be horizontally as close as possible to the center of the product.
6. Three-microphone solution: the microphones are equally spaced and distributed in a perfect circle with the angle 120 degrees from each other, and the spacing should be 4 ~ 6.5 cm.
---
### Microphone Structure Tightness
Use plasticine or other materials to seal the microphone pickup hole and compare how much the signals collected by the microphone decrease by before and after the seal. 25 dB is qualified, and 30 dB is recommended. Below are the test procedures.
1. Play white noise at 0.5 meters above the microphone, and keep the volume at the microphone 90 dB.
2. Use the microphone array to record for more than 10 s, and store it as recording file A.
3. Use plasticine or other materials to block the microphone pickup hole, record for more than 10 s, and store it as recording file B.
4. Compare the frequency spectrum of the two files and make sure that the overall attenuation in the 100 ~ 8 kHz frequency band is more than 25 dB.
---
### Echo Reference Signal Design
1. It is recommended that the echo reference signal be as close to the speaker side as possible, and recover from the DAC post-stage and PA pre-stage.
2. When the speaker volume is at its maximum, the echo reference signal input to the microphone should not have saturation distortion. At the maximum volume, the speaker amplifier output THD is less than 10% at 100 Hz, less than 6% at 200 Hz, and less than 3% above 350 Hz.
3. When the speaker volume is at its maximum, the sound pressure picked up by the microphone does not exceed 102 dB @ 1 kHz.
4. The echo reference signal voltage does not exceed the maximum allowed input voltage of the ADC. If it is too high, an attenuation circuit should be added.
5. A low-pass filter should be added to introduce the reference echo signal from the output of the Class D power amplifier. The cutoff frequency of the filter is recommended to be more than 22 kHz.
6. When the volume is played at the maximum, the recovery signal peak value is -3 to -5 dB.
---
### Microphone Array Consistency
It is required that the difference between the sampled signals of each microphone is less than 3 dB. Below are the test procedures.
1. Play white noise at 0.5 meters above the microphone, and keep the volume at the microphone 90 dB.
2. Use the microphone array to record for more than 10 s, and check whether the recording amplitude and audio sampling rate of each microphone are consistent.

View File

@ -1,363 +0,0 @@
# Audio Front-end Framework[[中文]](./README_CN.md)
Espressif Audio Front-end (AFE) algorithm framework is independently developed by ESPRESSIF AI Lab. Based on ESP32 series chips, the framework can provide high-quality and stable audio data.
---
## Summary
Espressif AFE provides the most convenient way to do audio front-end processing on ESP32 series chips. Espressif AFE framework stably get high-quality audio data for further wake-up or speech recognition.
Espressif AFE is divided into two sets of algorithms: 1) for speech recognition scenarios; 2) for voice communication scenarios. Shown as below:
- Speech recognition scenarios
![overview](../img/AFE_SR_overview.png)
- Voice communication scenarios
![overview](../img/AFE_VOIP_overview.png)
The data flow of Espressif AFE is also divided into two scenarios, shown as below:
- Speech recognition scenarios
![overview](../img/AFE_SR_workflow.png)
The workflow is as follows:
1) Use **ESP_AFE_SR_HANDLE** to create and initialize AFE (`voice_communication_init` needs to be configured as false)
2) AFE feed: Input audio data and will run AEC in the feed function
3) Internal: BSS/NS algorithm processing will be carried out.
4) AFE fetch: Return the audio data and the related information after processing. VAD processing and wake-up word detection will be carried out inside the fetch. The specific behavior depends on the config of `afe_config_t` structure. (Note: `wakenet_init` and `voice_communication_init` cannot be configured to true at the same time)
- Voice communication scenarios
![overview](../img/AFE_VOIP_workflow.png)
The workflow is as follows:
1) Use **ESP_AFE_VC_HANDLE** to create and initialize AFE (`voice_communication_init` needs to be configured as true)
2) AFE feed: Input audio data and will run AEC in the feed function
3) Internal: BSS/NS algorithm processing will be carried out. If it's dual MIC, the miso algorithm processing will be carried out later.
4) AFE fetch: Return the audio data and the related information after processing. The AGC algorithm processing will be carried out. And the specific gain depends on the config of `afe_config_t` structure. If it's dual MIC, the NS algorithm processing will be carried out before AGC. (Note: `wakenet_init` and `voice_communication_init` cannot be configured to true at the same time)
**Note:** `afe->feed()` and `afe->fetch()` are visible to users, while `internal BSS/NS/MISO task` is invisible to users.
> AEC runs in `afe->feed()` function; If aec_init is configured as false, BSS/NS will run in the afe->feed() function.
> BSS/NS/MISO is an independent task in AFE;
> The results of VAD/WakeNet and the audio data after processing are obtained by `afe->fetch()` function.
### Select AFE handle
Espressif AFE supports both single MIC and dual MIC scenarios, and the algorithm module can be flexibly configured. The internal task of single MIC applications is processed by NS, and the internal task of dual MIC applications is processed by BSS. If the dual microphone scenario is configured for voice communication (i.e. `wakenet_init=false, voice_communication_init=true`), the miso internal task will be added.
For the acquisition of AFE handle, there is a slight difference between speech recognition scenario and voice communication scenario:
- Speech recognition
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- Voice communication
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
### Input Audio data
The AFE supports two kinds of scenarios: single MIC and dual MIC. The number of channels can be configured according to the audio of `afe->feed()`. Modify method: It can modify the `pcm_config` configuration in macro `AFE_CONFIG_DEFAULT()`. It supports the following configuration combinations (Note: It must meet `total_ch_num = mic_num + ref_num`) :
> total_ch_num=1, mic_num=1, ref_num=0
> total_ch_num=2, mic_num=1, ref_num=1
> total_ch_num=2, mic_num=2, ref_num=0
> total_ch_num=3, mic_num=2, ref_num=1
(Note: total_ch_num: the number of total channels, mic_num: the number of microphone channels, ref_num: the number of reference channels)
At present, the AEC only supports one reference channel, so ref_num can only be 0 or 1.
- AFE single MIC
- Input audio data format: 16KHz, 16bit, two channels (one is mic data, another is reference data) ; If AEC is not required and the audio does not contain reference data. The input data can only have one channel of MIC data, and the ref_num need to be set 0.
- The input data frame length will vary according to the algorithm module configured by the user. Users can use `afe->get_feed_chunksize()` to get the number of sampling points (the data type of sampling points is int16).
The input data is arranged as follows:
<img src="../img/AFE_mode_0.png" height = "100" align=center />
- AFE dual MIC
- Input audio data format: 16KHz, 16bit, three channels (two are mic data, another is reference data) ; If AEC is not required and the audio does not contain reference data. The input data can only have two channels of MIC data, and the ref_num need to be set 0.
- The input data frame length will vary according to the algorithm module configured by the user. Users can use `afe->get_feed_chunksize()` to get the number of sampling points (the data type of sampling points is int16).
The input data is arranged as follows:
<img src="../img/AFE_mode_other.png" height = "70" align=center />
Note: the converted data size is: `afe->get_feed_chunksize * channel number * sizeof(short)`
### AEC Introduction
The AEC (Acoustic Echo Cancellation) algorithm supports maximum two-mic processing, which can effectively remove the echo in the mic input signal, and help with further speech recognition.
### NS (noise suppression)
NS algorithm supports single-channel processing and can suppress the non-human noise in single-channel audio, especially for steady noise.
### BSS (Blind Source Separation)
BSS algorithm supports dual-channel processing, which can well separate the target sound source from the rest of the interference sound, so as to extract the useful audio signal and ensure the quality of the subsequent speech.
### MISO (Multi Input Single Output)
Miso algorithm supports dual channel input and single channel output. It is used to select a channel of audio output with high signal-to-noise ratio when there is no wakenet enable in the dual mic scene.
### VAD (Voice Activity Detection)
VAD algorithm supports real-time output of the voice activity state of the current frame.
### AGC (Automatic Gain Control)
AGC dynamically adjusts the amplitude of the output audio, and amplifies the output amplitude when a weak signal is input; When the input signal reaches a certain strength, the output amplitude will be compressed.
### WakeNet or Bypass
Users can choose whether to detect wake words in AFE. When calling `afe->disable_wakenet(afe_data)`, it will enter bypass mode, and the WakeNet will not run.
### Output Audio
The output audio of AFE is single-channel data. In the speech recognition scenario, AFE will output single-channel data with human voice while WakeNet is enabled. In the voice communication scenario, single channel data with higher signal-to-noise ratio will be output.
---
## Quick Start
### 1. Define afe_handle
`afe_handle` is the function handle through which the user calls the AFE interface. Therefore, the first step is to obtain `afe_handle`.
- Speech recognition
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- Voice communication
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
### 2. Configure AFE
Get the configuration of AFE:
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
Users can adjust the switch of each algorithm module and its corresponding parameters in ` afe_config`:
```
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \
.pcm_config.ref_num = 1, \
}
```
- aec_init: Whether the AEC algorithm is enabled.
- se_init: Whether the BSS/NS algorithm is enabled.
- vad_init: Whether the VAD algorithm is enabled ( It can only be used in speech recognition scenarios ).
- wakenet_init: Whether the wake algorithm is enabled.
- voice_communication_init: Whether voice communication is enabled. It cannot be enabled with wakenet_init at the same time.
- voice_communication_agc_init: Whether the AGC is enabled in voice communication.
- voice_communication_agc_gain: The gain of AGC ( unit: dB )
- vad_mode: The VAD operating mode. The bigger, the more radical.
- wakenet_model_name: Its default value is NULL in macro `AFE_CONFIG_DEFAULT()`. At first, you need to choose WakeNet model through `idf.py menuconfig`. Then you need to assign a specific model name to this place before `afe_handle->create_from_config`. The type of value is string. Please refer to[flash_model](../flash_model/README.md)
(Note: In the example, we use the `esp_srmodel_filter()` to get wakenet_model_name. If you choose the multiple wakenet models coexist through menuconfig, this function will return a model name randomly.)
- wakenet_mode: Wakenet mode. It indicates the number of wake-up channels according to the number of MIC channels.
- afe_mode: Espressif AFE supports two working modes: SR_MODE_LOW_COST, SR_MODE_HIGH_PERF. See the afe_sr_mode_t enumeration for details.
- SR_MODE_LOW_COST: The quantified version occupies less resources.
- SR_MODE_HIGH_PERF: The non-quantified version occupies more resources.
**ESP32 only supports SR_MODE_HIGH_PERF;
And ESP32S3 supports both of the modes **
- afe_perferred_core: The internal BSS/NS/MISO algorithm of AFE will be running on which CPU core.
- afe_perferred_priority: The running priority of BSS/NS/MISO algorithm task.
- afe_ringbuf_size: Configuration of internal ringbuf size.
- memory_alloc_mode: Memory allocation mode. Three values can be configured:
- AFE_MEMORY_ALLOC_MORE_INTERNAL: More memory is allocated from internal ram.
- AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE: Part of memory is allocated from internal psram.
- AFE_MEMORY_ALLOC_MORE_PSRAM: Most of memory is allocated from external psram.
- agc_mode: Configuration for linear audio amplification which be used in speech recognition. It only takes effect when wakenet_init is enabled. Four values can be configured:
- AFE_MN_PEAK_AGC_MODE_1: Linearly amplify the audio which will be fed to multinet. The peak value is -5 dB.
- AFE_MN_PEAK_AGC_MODE_2: Linearly amplify the audio which will be fed to multinet. The peak value is -4 dB.
- AFE_MN_PEAK_AGC_MODE_3: Linearly amplify the audio which will be fed to multinet. The peak value is -3 dB.
- AFE_MN_PEAK_NO_AGC: No amplification.
- pcm_config: Configure according to the audio that fed by `afe->feed()`. This structure has three member variables to configure:
- total_ch_num: Total number of audio channels: total_ch_num = mic_num + ref_num.
- mic_num: The number of microphone channels. It only can be set to 1 or 2.
- ref_num: The number of reference channels. It only can be set to 0 or 1.
### 3. Create afe_data
The user uses the `afe_handle->create_from_config(&afe_config)` function to obtain the data handle, which will be used internally in afe, and the parameters passed in are the configurations obtained in step 2 above.
```
/**
* @brief Function to initialze a AFE_SR instance
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
```
### 4. feed audio data
After initializing AFE, users need to input audio data into AFE by `afe_handle->feed()` function for processing.
The input audio size and layout format can refer to the step **Input Audio data**.
```
/**
* @brief Feed samples of an audio stream to the AFE_SR
*
* @Warning The input data should be arranged in the format of channel interleaving.
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
```
Get the number of audio channels:
`afe_handle->get_total_channel_num()` function can provide the number of channels that need to be put into `afe_handle->feed()` function. Its return value is equal to `pcm_config.mic_num + pcm_config.ref_num` in AFE_CONFIG_DEFAULT()
```
/**
* @brief Get the total channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
```
### 5. fetch audio data
Users can get the processed single-channel audio and related information by `afe_handle->fetch()` function.
The number of data sampling points of fetch (the data type of sampling point is int16) can be got by `afe_handle->get_fetch_chunksize`.
```
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
*
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param afe The AFE_SR object to query
* @return The amount of samples to feed the fetch function
*/
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
```
The declaration of `afe_handle->fetch()` is as follows:
```
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
```
Its return value is a pointer of structure, and the structure is defined as follows:
```
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
int wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. It's unit is the number of samples.
int ret_value; // the return state of fetch function
void* reserved; // reserved for future use
} afe_fetch_result_t;
```
### 6. Usage of WakeNet
When users need to perform other operations after wake-up, such as offline or online speech recognition. They can pause the operation of WakeNet to reduce the CPU resource consumption.
Users can call `afe_handle->disable_wakenet(afe_data)` to stop WakeNet, or call `afe_handle->enable_wakenet(afe_data)` to enable WakeNet.
In addition, the ESP32S3 chip supports switching between wakenet words. (Note: the ESP32 chip only supports one wake-up word and does not support switching). After AFE initialization, the ESP32S3 can switch the wakenet word by `afe_handle->set_wakenet()`. For example, `afe_handle->set_wakenet(afe_data, "wn9_hilexin")` switches to the "Hi Lexin" wake word. For how to configure multiple wakenet words, please refer to: [flash_model](../flash_model/README.md)
### 7. Usage of AEC
The usage of AEC is similar to that of WakeNet. Users can disable or enable AEC according to requirements.
- Disable AEC
afe->disable_aec(afe_data);
- Enable AEC
afe->enable_aec(afe_data);

View File

@ -1,363 +0,0 @@
# Audio Front-end 框架[[English]](./README.md)
乐鑫 Audio Front-end(AFE) 算法框架由乐鑫 AI 实验室自主开发。该框架基于 ESP32 系列芯片,能够提供高质量并且稳定的音频数据。
---
## 概述
乐鑫 AFE 框架以最便捷的方式基于乐鑫的 ESP32 系列芯片进行语音前端处理。使用乐鑫 AFE 框架,您可以获取高质量且稳定的音频数据,从而更加方便地构建唤醒或语音识别等应用。
乐鑫 AFE 的功能分为两套1针对语音识别场景2针对语音通话场景。如下所示
- 语音识别场景
![overview](../img/AFE_SR_overview.png)
- 语音通话场景
![overview](../img/AFE_VOIP_overview.png)
乐鑫 AFE 的数据流也相应分为两种场景,如下所示:
- 语音识别场景
![overview](../img/AFE_SR_workflow.png)
工作流程如下:
1) 使用 **ESP_AFE_SR_HANDLE**进行AFE 的创建和初始化 (`voice_communication_init`需配置为 false )
2) AFE feed输入音频数据feed 内部会先进行 AEC 算法处理
3) 内部: 进行 BSS/NS 算法处理
4) AFE fetch返回处理过的单通道音频数据和相关信息 fetch 内部会进行 VAD 处理,以及唤醒词的检测,具体行为取决于用户对 `afe_config_t` 结构体的配置。(注:`wakenet_init` 和 `voice_communication_init` 不可同时配置为 true)
- 语音通话场景
![overview](../img/AFE_VOIP_workflow.png)
工作流程如下:
1) 使用 **ESP_AFE_VC_HANDLE**进行AFE 的创建和初始化 (`voice_communication_init`需配置为 true )
2) AFE feed输入音频数据feed 内部会先进行 AEC 算法处理
3) 内部: 首先进行 BSS/NS 算法处理若为双麦随后还会进行MISO 算法处理;
4) AFE fetch返回处理过的单通道音频数据和相关信息。其中会进行AGC非线性放大具体增益值取决于用户对 `afe_config_t` 结构体的配置若为双麦在AGC之前还会进行降噪处理。(注:`wakenet_init` 和 `voice_communication_init` 不可同时配置为 true)
**Note:** `afe->feed()``afe->fetch()` 对用户可见,`Internal BSS/NS/MISO Task` 对用户不可见。
> AEC 在 afe->feed() 函数中运行;若 aec_init 配置为 false 状态BSS/NS 将会在 afe->feed() 函数中运行。
> BSS/NS/MISO 为 AFE 内部独立 Task 进行处理;
> VAD/WakeNet 的结果,以及处理后的单通道音频,通过 afe->fetch() 函数获取。
### 选择 AFE handle
目前 AFE 支持单麦和双麦两种应用场景,并且可对算法模块进行灵活配置。单麦场景内部 Task 为 NS 处理,双麦场景内部 Task 为 BSS 处理,双麦场景若配置为语音通话(即:`wakenet_init=false, voice_communication_init=true`),则会再增加一个 MISO 的内部 Task。
对于AFE handle的获取语音识别场景与语音通话场景略有差异
- 语音识别
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- 语音通话
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
### 输入音频
目前 AFE 支持单麦和双麦两种应用场景,可根据 `afe->feed()` 的音频,配置相应的音频通道数。修改方式:在宏 `AFE_CONFIG_DEFAULT()` 中对 `pcm_config` 结构体成员进行配置修改,其支持如下几种配置组合 (注:一定要满足 `total_ch_num = mic_num + ref_num`)
> total_ch_num=1, mic_num=1, ref_num=0
> total_ch_num=2, mic_num=1, ref_num=1
> total_ch_num=2, mic_num=2, ref_num=0
> total_ch_num=3, mic_num=2, ref_num=1
(注解: total_ch_num: 总通道数mic_num: 麦克风通道数ref_num: 参考回路通道数)
对于 AEC目前只支持单回路故 ref_num 的值只能为 0 或 1
- AFE 单麦场景
- 输入音频格式为 16KHz, 16bit, 双通道 (1个通道为 mic 数据,另一个通道为参考回路) ; 若不需要 AEC , 音频不包含参考回路则可只包含1个通道 mic 数据ref_num 设置为0。
- 输入数据帧长,会根据用户配置的算法模块不同而有差异, 用户可以使用 `afe->get_feed_chunksize` 来获取需要的采样点数目(采样点数据类型为 int16
数据排布如下:
<img src="../img/AFE_mode_0.png" height = "100" align=center />
- AFE 双麦场景
- 输入音频格式为 16KHz, 16bit, 三通道;若不需要 AEC , 音频不包含参考回路,则可只包含两个通道 mic 数据ref_num 设置为0。
- 输入数据帧长,会根据用户配置的算法模块不同而有差异, 用户可以使用 `afe->get_feed_chunksize` 来获取需要填充的数据量
数据排布如下:
<img src="../img/AFE_mode_other.png" height = "70" align=center />
注意:换算成数据量大小为:`afe->get_feed_chunksize * 通道数 * sizeof(short)`
### AEC 简介
AEC (Acoustic Echo Cancellation) 算法最多支持双麦处理,能够有效的去除 mic 输入信号中的自身播放声音。从而可以在自身播放音乐的情况下进行很好的语音识别等应用。
### NS 简介
NS (Noise Suppression) 算法支持单通道处理,能够对单通道音频中的非人声噪声进行抑制,尤其针对稳态噪声,具有很好的抑制效果。
### BSS 简介
BSS (Blind Source Separation) 算法支持双通道处理,能够很好的将目标声源和其余干扰音进行盲源分离,从而提取出有用音频信号,保证了后级语音的质量。
### MISO 简介
MISO (Multi Input Single Output) 算法支持双通道输入,单通道输出。用于在双麦场景,没有唤醒使能的情况下,选择信噪比高的一路音频输出。
### VAD 简介
VAD (Voice Activity Detection) 算法支持实时输出当前帧的语音活动状态。
### AGC 简介
AGC (Automatic Gain Control) 动态调整输出音频的幅值,当弱信号输入时,放大输出幅度;当输入信号达到一定强度时,压缩输出幅度。
### WakeNet or Bypass 简介
用户可以选择是否在 AFE 中进行唤醒词的识别。当用户调用 `afe->disable_wakenet(afe_data)` 后,则进入 Bypass 模式AFE 模块不会进行唤醒词的识别。
### 输出音频
AFE 的输出音频为单通道数据。在语音识别场景若WakeNet 开启的情况下AFE 会输出有目标人声的单通道数据。在语音通话场景,将会输出信噪比更高的单通道数据。
---
## 快速开始
### 1. 定义 afe_handle
`afe_handle` 是用户后续调用 afe 接口的函数句柄。所以第一步需先获得 `afe_handle`
- 语音识别
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- 语音通话
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
### 2. 配置 afe
获取 afe 的配置:
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
可调整`afe_config`中各算法模块的使能及其相应参数:
```
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \
.pcm_config.ref_num = 1, \
}
```
- aec_init: AEC 算法是否使能。
- se_init: BSS/NS 算法是否使能。
- vad_init: VAD 是否使能 ( 仅可在语音识别场景中使用 )
- wakenet_init: 唤醒是否使能。
- voice_communication_init: 语音通话是否使能。与 wakenet_init 不能同时使能。
- voice_communication_agc_init: 语音通话中AGC是否使能。
- voice_communication_agc_gain: AGC的增益值单位为dB。
- vad_mode: VAD 检测的操作模式,越大越激进。
- wakenet_model_name: 宏`AFE_CONFIG_DEFAULT()`中该值默认为NULL。使用 `idf.py menuconfig` 选择了相应的唤醒模型后,在调用`afe_handle->create_from_config`之前,需给该处赋值具体的模型名字,类型为字符串形式。唤醒模型的具体说明,详见:[flash_model](../flash_model/README_cn.md)
(注意:示例代码中,使用了 esp_srmodel_filter() 获取模型名字,若 menuconfig 中选择了多个模型共存,该函数将会随机返回一个模型名字)
- wakenet_mode: 唤醒的模式。对应为多少通道的唤醒根据mic通道的数量选择
- afe_mode: 乐鑫 AFE 目前支持 2 种工作模式分别为SR_MODE_LOW_COST, SR_MODE_HIGH_PERF。详细可见 afe_sr_mode_t 枚举。
- SR_MODE_LOW_COST: 量化版本,占用资源较少。
- SR_MODE_HIGH_PERF: 非量化版本,占用资源较多。
**ESP32 芯片,只支持模式 SR_MODE_HIGH_PERF;
ESP32S3 芯片,两种模式均支持 **
- afe_perferred_core: AFE 内部 BSS/NS/MISO 算法,运行在哪个 CPU 核。
- afe_perferred_priority: AFE 内部 BSS/NS/MISO 算法运行的task优先级。
- afe_ringbuf_size: 内部 ringbuf 大小的配置。
- memory_alloc_mode: 内存分配的模式。可配置三个值:
- AFE_MEMORY_ALLOC_MORE_INTERNAL: 更多的从内部ram分配。
- AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE: 部分从内部ram分配。
- AFE_MEMORY_ALLOC_MORE_PSRAM: 绝大部分从外部psram分配
- agc_mode: 将音频线性放大的 level 配置,该配置在语音识别场景下起作用,并且在唤醒使能时才生效。可配置四个值:
- AFE_MN_PEAK_AGC_MODE_1: 线性放大喂给后续multinet的音频峰值处为 -5dB。
- AFE_MN_PEAK_AGC_MODE_2: 线性放大喂给后续multinet的音频峰值处为 -4dB。
- AFE_MN_PEAK_AGC_MODE_3: 线性放大喂给后续multinet的音频峰值处为 -3dB。
- AFE_MN_PEAK_NO_AGC: 不做线性放大
- pcm_config: 根据 `afe->feed()` 喂入的音频结构进行配置,该结构体有三个成员变量需要配置:
- total_ch_num: 音频总的通道数total_ch_num = mic_num + ref_num。
- mic_num: 音频的麦克风通道数。目前仅支持配置为 1 或 2。
- ref_num: 音频的参考回路通道数,目前仅支持配置为 0 或 1。
### 3. 创建 afe_data
用户使用 `afe_handle->create_from_config(&afe_config)` 函数来获得数据句柄这将会在afe内部使用传入的参数即为上面第2步中获得的配置。
```
/**
* @brief Function to initialze a AFE_SR instance
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
```
### 4. feed 音频数据
在初始化 AFE 完成后,用户需要将音频数据使用 `afe_handle->feed()` 函数输入到 AFE 中进行处理。
输入的音频大小和排布格式可以参考 **输入音频** 这一步骤。
```
/**
* @brief Feed samples of an audio stream to the AFE_SR
*
* @Warning The input data should be arranged in the format of channel interleaving.
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
```
获取音频通道数:
使用 `afe_handle->get_total_channel_num()` 函数可以获取需要传入 `afe_handle->feed()` 函数的总数据通道数。其返回值等于AFE_CONFIG_DEFAULT()中配置的 `pcm_config.mic_num + pcm_config.ref_num`
```
/**
* @brief Get the total channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
```
### 5. fetch 音频数据
用户调用 `afe_handle->fetch()` 函数可以获取处理完成的单通道音频以及相关处理信息。
fetch 的数据采样点数目(采样点数据类型为 int16可以通过 `afe_handle->get_fetch_chunksize` 获取。
```
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
*
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param afe The AFE_SR object to query
* @return The amount of samples to feed the fetch function
*/
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
```
`afe_handle->fetch()` 的函数声明如下:
```
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
```
其返回值为结构体指针,结构体定义如下:
```
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
int wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. It's unit is the number of samples.
int ret_value; // the return state of fetch function
void* reserved; // reserved for future use
} afe_fetch_result_t;
```
### 6. WakeNet 使用
当用户在唤醒后需要进行其他操作,比如离线或在线语音识别,这时候可以暂停 WakeNet 的运行,从而减轻 CPU 的资源消耗。
用户可以调用 `afe_handle->disable_wakenet(afe_data)` 来停止 WakeNet。 当后续应用结束后又可以调用 `afe_handle->enable_wakenet(afe_data)` 来开启 WakeNet。
另外ESP32S3 芯片,支持唤醒词切换。(注: ESP32 芯片只支持一个唤醒词,不支持切换)。在初始化 AFE 完成后ESP32S3 芯片可通过 `set_wakenet()`函数切换唤醒词。例如, `afe_handle->set_wakenet(afe_data, “wn9_hilexin”)` 切换到“Hi Lexin”唤醒词。具体如何配置多个唤醒词详见[flash_model](../flash_model/README_CN.md)
### 7. AEC 使用
AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或开启 AEC。
- 停止 AEC
afe->disable_aec(afe_data);
- 开启 AEC
afe->enable_aec(afe_data);

94
docs/check_doc_chars.py Executable file
View File

@ -0,0 +1,94 @@
#!/usr/bin/env python
#-*- coding: utf-8 -*-
#
# Copyright 2021 Espressif Systems (Shanghai) PTE LTD
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import sys, os, re

# Python 2 writes non-ASCII output below; force a UTF-8 default encoding.
# (No-op on Python 3, where reload/setdefaultencoding do not exist.)
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# Allowed characters: some Chinese characters, symbols, and punctuation.
# NOTE(review): several entries in this list appear empty in this view —
# they look like punctuation characters lost in transit; verify the list
# against the original file's encoding before relying on it.
at_allowed_chars_list = ['中文', '®', '', '', '', '', '', '×', '', '', '', '', '',"±","°"]
# Matches any single byte that is not LF (0x0a), CR (0x0d), or printable
# ASCII (0x20-0x7e) — i.e. a candidate "illegal" byte.
at_not_allowed_chars_list = re.compile(b'[^\x0a\x0d\x20-\x7e]')
# Files exempted from the character check (matched by basename).
at_file_white_list = ['index_of_abbreviations.rst']
def at_get_file_list(doc_path, subdir_file_list):
    """Recursively collect every file path under doc_path.

    doc_path may be a single file (appended as-is) or a directory, which is
    walked depth-first in os.listdir() order. Results are accumulated into
    subdir_file_list, which is also returned for convenience.
    """
    # A plain file is a leaf: record it and stop.
    if not os.path.isdir(doc_path):
        subdir_file_list.append(doc_path)
        return subdir_file_list
    for entry in os.listdir(doc_path):
        entry_path = os.path.join(doc_path, entry)
        if os.path.isdir(entry_path):
            # Recurse; the shared accumulator keeps traversal order intact.
            at_get_file_list(entry_path, subdir_file_list)
        else:
            subdir_file_list.append(entry_path)
    return subdir_file_list
def at_data_is_allowed_chars(match_info, data):
    """Return True when every non-ASCII byte region in `data` is whitelisted.

    `data` is one raw line (bytes); `match_info` is the first regex hit of a
    disallowed byte in that line. Starting at the hit, each disallowed region
    must begin with the UTF-8 encoding of an entry in at_allowed_chars_list;
    scanning then resumes just past the matched entry.

    Fix: the original implementation validated only the character at the FIRST
    disallowed byte and returned True immediately, so any later illegal
    character on the same line was never checked. Empty whitelist entries are
    now skipped as well (an empty bytes prefix matches everything and would
    make the whole check a no-op).
    """
    idx = match_info.start()
    while True:
        matched_len = 0
        for chars in at_allowed_chars_list:
            encoded = chars.encode()
            # Skip empty entries; require the allowed sequence at idx.
            if encoded and data[idx:idx + len(encoded)] == encoded:
                matched_len = len(encoded)
                break
        if matched_len == 0:
            return False
        # Continue scanning after the allowed sequence for the next
        # disallowed byte, if any.
        next_hit = at_not_allowed_chars_list.search(data, idx + matched_len)
        if next_hit is None:
            return True
        idx = next_hit.start()
def at_check_doc_chars_validity(doc_name):
    """Scan one file for bytes outside the ASCII range.

    Returns True when the file is clean (or every flagged sequence passes
    at_data_is_allowed_chars), otherwise prints a diagnostic for the first
    offending line and returns False.
    """
    with open(doc_name, "rb") as doc_file:
        for line_idx, raw_line in enumerate(doc_file):
            hit = re.search(at_not_allowed_chars_list, raw_line)
            if hit is None:
                continue
            if at_data_is_allowed_chars(hit, raw_line):
                continue
            # Report the offending line (1-based) and dump the whitelist so
            # the author can fix the character or extend the whitelist.
            print("\033[31mError: illegal character detected at %s:%d\033[0m" %(doc_name, line_idx + 1))
            print("raw data ----> %s\r\n" %raw_line)
            print("Allowed chars:")
            for allowed in at_allowed_chars_list:
                print(allowed, "---->", allowed.encode())
            return False
    return True
def _main():
    """Entry point: character-check every documentation file under a path.

    Usage: check_doc_chars.py [path] -- defaults to ./en when no path is
    given. Exits with status -1 on the first file failing the check.
    """
    if len(sys.argv) == 2:
        dst_path = os.path.abspath(sys.argv[1])
    else:
        dst_path = os.path.abspath('.') + "/en"
    at_en_doc_file_list = at_get_file_list(dst_path, [])
    for current_file in at_en_doc_file_list:
        # Fix: the original nested loop only `continue`d the inner whitelist
        # loop, so with more than one whitelist entry every file was checked
        # once per non-matching entry and whitelisted files could still be
        # checked. A simple membership test expresses the intent directly.
        if os.path.basename(current_file) in at_file_white_list:
            continue
        if at_check_doc_chars_validity(current_file) == False:
            sys.exit(-1)
    print("\033[1;32mDocument characters check passed! (%s)\033[0m" %dst_path)
if __name__ == '__main__':
    _main()

36
docs/check_lang_folder_sync.sh Executable file
View File

@ -0,0 +1,36 @@
#!/usr/bin/env bash
#
# Check if folders with localized documentation are in sync
#
# 1. Traverse each folder with language version and generate a sorted list
#    of all the files inside
# 2. Compare the sorted lists of files and flag differences
#
# Note:
#     All differences between folders with language versions should be resolved
#     before releasing documentation
#
RESULT=0
STARS='***************************************************'

# Build one sorted list per language folder, stripping the leading
# language-folder component so the relative paths are comparable.
find en -type f | cut -d/ -f2- | sort > file_list_en
find zh_CN -type f | cut -d/ -f2- | sort > file_list_zh_CN

# Suppress lines common to both lists; tag one-sided lines with their folder.
FOLDER_DIFFERENCES=$(diff --unchanged-line-format= --old-line-format='[en]:%L' --new-line-format='[zh_CN]:%L' file_list_en file_list_zh_CN)

if [ -n "$FOLDER_DIFFERENCES" ]; then
    echo "$STARS"
    echo "Build failed due to the following differences in 'en' and 'zh_CN' folders:"
    echo "$FOLDER_DIFFERENCES"
    echo "$STARS"
    echo "Please synchronize contents of 'en' and 'zh_CN' folders to contain files with identical names"
    RESULT=1
fi

# remove temporary files
rm file_list_en file_list_zh_CN

exit $RESULT

67
docs/conf_common.py Executable file
View File

@ -0,0 +1,67 @@
# -*- coding: utf-8 -*-
#
# Common (non-language-specific) configuration for Sphinx
#
# This file is imported from a language-specific conf.py (ie en/conf.py or
# zh_CN/conf.py)
from __future__ import print_function, unicode_literals
import os.path
#ESP_DOCS_PATH = os.environ['ESP_DOCS_PATH']
# Pull in the shared Espressif Sphinx configuration; fall back to a source
# checkout added to sys.path when the esp_docs package is not installed.
try:
    from esp_docs.conf_docs import * # noqa: F403,F401
except ImportError:
    import os
    import sys
    # NOTE(review): ESP_DOCS_PATH is only assigned by the commented-out line
    # above, so this fallback raises NameError unless it is defined elsewhere
    # -- confirm the intended fallback behavior.
    sys.path.insert(0, os.path.abspath(ESP_DOCS_PATH))
    from conf_docs import * # noqa: F403,F401
# Documents included only when building for the esp32 target tag (consumed
# through conditional_include_dict below).
ESP32_DOCS = ['audio_front_end/README.rst',
'wake_word_engine/README.rst',
'wake_word_engine/ESP_Wake_Words_Customization.rst',
'speech_command_recognition/README.rst',
'flash_model/README.rst',
'audio_front_end/Espressif_Microphone_Design_Guidelines.rst',
'test_report/README.rst',
'performance_test/README.rst',
]
# format: {tag needed to include: documents to included}, tags are parsed from sdkconfig and peripheral_caps.h headers
conditional_include_dict = {
'esp32':ESP32_DOCS,
}
# Extend the extension list inherited from esp_docs via the star-import.
extensions += ['sphinx_copybutton',
# Note: order is important here, events must
# be registered by one extension before they can be
# connected to another extension
'esp_docs.esp_extensions.dummy_build_system',
'esp_docs.esp_extensions.run_doxygen',
]
# link roles config
github_repo = 'espressif/esp-sr'
# context used by sphinx_idf_theme
html_context['github_user'] = 'espressif'
html_context['github_repo'] = 'esp-sr'
# Chip targets and languages this documentation set is built for.
idf_targets = ['esp32', 'esp32s2', 'esp32s3']
languages = ['en', 'zh_CN']
# Analytics ID is injected by CI; None disables analytics for local builds.
google_analytics_id = os.environ.get('CI_GOOGLE_ANALYTICS_ID', None)
project_homepage = 'https://github.com/espressif/esp-sr'
html_static_path = ['../_static']
# Extra options required by sphinx_idf_theme
project_slug = 'esp-sr'
versions_url = './_static/js/at_versions.js'
# Final PDF filename will contains target and version
pdf_file_prefix = u'esp-sr'

View File

@ -0,0 +1 @@
semphr.h:line: warning: argument 'pxStaticSemaphore' of command @param is not found in the argument list of xSemaphoreCreateCounting(uxMaxCount, uxInitialCount)

18
docs/en/404.rst Normal file
View File

@ -0,0 +1,18 @@
:orphan:
Page not Found
==============
:link_to_translation:`zh_CN:[中文]`
.. note::
We're sorry. The page you requested could not be found.
Please use menu on the left to navigate through documentation contents. Optionally type the phrase you are looking for in a search box above the menu and press enter.
.. figure:: ../_static/404-page__en.svg
:align: center
:alt: We're sorry. The page you requested could not be found.
:figclass: align-center
* :ref:`genindex`

View File

@ -0,0 +1,4 @@
{% extends '!layout.html' %}
{% block comments %}
<p style="text-align:center"><a href="https://www.espressif.com/en/company/documents/documentation_feedback?docId=4419&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">Provide feedback about this document</a></p>
{% endblock %}

View File

@ -0,0 +1,69 @@
Espressif Microphone Design Guidelines
=======================================
:link_to_translation:`zh_CN:[中文]`
This document provides microphone design guidelines and suggestions for the ESP32-S3 series of audio development boards.
Electrical Performance
----------------------
#. Type: omnidirectional MEMS microphone
#. Sensitivity
- Under 1 Pa sound pressure, it should be no less than -38 dBV for analog microphones, and -26 dB for digital microphones.
- The tolerance should be controlled within ±2 dB, and within ±1 dB for microphone arrays.
#. Signal-to-noise ratio (SNR)
- No less than 62 dB. Higher than 64 dB is recommended.
- Frequency response fluctuates within ±3 dB from 50 to 16 kHz.
- PSRR should be larger than 55 dB for MEMS microphones.
Structure Design
----------------
#. The aperture or width of the microphone hole is recommended to be greater than 1 mm, the pickup pipe should be as short as possible, and the cavity should be as small as possible to ensure that the resonance frequency of the microphone and structural components is above 9 kHz.
#. The depth and diameter of the pickup hole are less than 4:1, and the thickness of the shell is recommended to be 1 mm. If the shell is too thick, the opening area must be increased.
#. The microphone hole must be protected by an anti-dust mesh.
#. Silicone sleeve or foam must be added between the microphone and the device shell for sealing and shockproofing, and an interference fit design is required to ensure the tightness of the microphone.
#. The microphone hole cannot be blocked. The bottom microphone hole needs to be increased in structure to prevent it from being blocked by the desktop.
#. The microphone should be placed far away from the speaker and other objects that can produce noise or vibration, and be isolated and buffered by rubber pads from the speaker sound cavity.
Microphone Array Design
-----------------------
#. Type: omnidirectional MEMS microphone. Use the same models from the same manufacturer for the array. Not recommended mixing different microphones.
#. The sensitivity difference among microphones in the array is within 3 dB.
#. The phase difference among the microphones in the array is controlled within 10°.
#. It is recommended to keep the structural design of each microphone in the array the same to ensure consistency.
#. Two-microphone solution: the distance between the microphones should be 4 ~ 6.5 cm, the axis connecting them should be parallel to the horizontal line, and the center of the two microphones should be horizontally as close as possible to the center of the product.
#. Three-microphone solution: the microphones are equally spaced and distributed in a perfect circle with the angle 120 degrees from each other, and the spacing should be 4 ~ 6.5 cm.
Microphone Structure Tightness
------------------------------
Use plasticine or other materials to seal the microphone pickup hole and compare how much the signals collected by the microphone decrease by before and after the seal. 25 dB is qualified, and 30 dB is recommended. Below are the test procedures.
#. Play white noise at 0.5 meters above the microphone, and keep the volume at the microphone 90 dB.
#. Use the microphone array to record for more than 10 s, and store it as recording file A.
#. Use plasticine or other materials to block the microphone pickup hole, record for more than 10 s, and store it as recording file B.
#. Compare the frequency spectrum of the two files and make sure that the overall attenuation in the 100 ~ 8 kHz frequency band is more than 25 dB.
Echo Reference Signal Design
----------------------------
#. It is recommended that the echo reference signal be as close to the speaker side as possible, and recover from the DAC post-stage and PA pre-stage.
#. When the speaker volume is at its maximum, the echo reference signal input to the microphone should not have saturation distortion. At the maximum volume, the speaker amplifier output THD is less than 10% at 100 Hz, less than 6% at 200 Hz, and less than 3% above 350 Hz.
#. When the speaker volume is at its maximum, the sound pressure picked up by the microphone does not exceed 102 dB @ 1 kHz.
#. The echo reference signal voltage does not exceed the maximum allowed input voltage of the ADC. If it is too high, an attenuation circuit should be added.
#. A low-pass filter should be added to introduce the reference echo signal from the output of the Class D power amplifier. The cutoff frequency of the filter is recommended to be more than 22 kHz.
#. When the volume is played at the maximum, the recovery signal peak value is -3 to -5 dB.
Microphone Array Consistency
----------------------------
It is required that the difference between the sampled signals of each microphone is less than 3 dB. Below are the test procedures.
#. Play white noise at 0.5 meters above the microphone, and keep the volume at the microphone 90 dB.
#. Use the microphone array to record for more than 10 s, and check whether the recording amplitude and audio sampling rate of each microphone are consistent.

View File

@ -0,0 +1,423 @@
Audio Front-end Framework
=========================
:link_to_translation:`zh_CN:[中文]`
Espressif Audio Front-end (AFE) algorithm framework is independently developed by ESPRESSIF AI Lab. Based on ESP32 series chips, the framework can provide high-quality and stable audio data.
Summary
-------
Espressif AFE provides the most convenient way to do audio front-end
processing on ESP32 series chips. Espressif AFE framework stably get
high-quality audio data for further wake-up or speech recognition.
Espressif AFE is divided into two sets of algorithms:
#. for speech recognition scenarios;
#. for voice communication scenarios. Shown as below:
- Speech recognition scenarios
.. figure:: ../../_static/AFE_SR_overview.png
:alt: overview
- Voice communication scenarios
.. figure:: ../../_static/AFE_VOIP_overview.png
:alt: overview
The data flow of Espressif AFE is also divided into two scenarios, shown
as below:
- Speech recognition scenarios
.. figure:: ../../_static/AFE_SR_workflow.png
:alt: overview
The workflow is as follows:
#. Use **ESP_AFE_SR_HANDLE** to create and initialize AFE
(``voice_communication_init`` needs to be configured as false)
#. AFE feed: Input audio data and will run AEC in the feed function
#. Internal: BSS/NS algorithm processing will be carried out.
#. AFE fetch: Return the audio data and the related information after processing. VAD processing and wake-up word detection will be carried out inside the fetch. The specific behavior depends on the config of ``afe_config_t`` structure.
.. note ::
    ``wakenet_init`` and ``voice_communication_init`` cannot be configured to true at the same time
- Voice communication scenarios
.. figure:: ../../_static/AFE_VOIP_workflow.png
:alt: overview
The workflow is as follows:
#. Use **ESP_AFE_VC_HANDLE** to create and initialize AFE (``voice_communication_init`` needs to be configured as true)
#. AFE feed: Input audio data and will run AEC in the feed function
#. Internal: BSS/NS algorithm processing will be carried out. If it's dual MIC, the miso algorithm processing will be carried out later.
#. AFE fetch: Return the audio data and the related information after processing. The AGC algorithm processing will be carried out. And the specific gain depends on the config of ``afe_config_t`` structure. If it's dual MIC, the NS algorithm processing will be carried out before AGC.
.. note ::
    ``wakenet_init`` and ``voice_communication_init`` cannot be configured to true at the same time
.. note ::
``afe->feed()`` and ``afe->fetch()`` are visible to users, while ``internal BSS/NS/MISO task`` is invisible to users.
* AEC runs in ``afe->feed()`` function; If aec_init is configured as false, BSS/NS will run in the afe->feed() function.
* BSS/NS/MISO is an independent task in AFE;
* The results of VAD/WakeNet and the audio data after processing are obtained by ``afe->fetch()`` function.
Select AFE Handle
~~~~~~~~~~~~~~~~~
Espressif AFE supports both single MIC and dual MIC scenarios, and the algorithm module can be flexibly configured. The internal task of single MIC applications is processed by NS, and the internal task of dual MIC applications is processed by BSS. If the dual microphone scenario is configured for voice communication
(i.e. ``wakenet_init=false, voice_communication_init=true``), the miso internal task will be added.
For the acquisition of AFE handle, there is a slight difference between speech recognition scenario and voice communication scenario:
- Speech recognition
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- Voice communication
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
Input Audio Data
~~~~~~~~~~~~~~~~
The AFE supports two kinds of scenarios: single MIC and dual MIC. The number of channels can be configured according to the audio of ``afe->feed()``. Modify method: It can modify the ``pcm_config`` configuration in macro ``AFE_CONFIG_DEFAULT()``. It supports the following configuration combinations
.. note ::
It must meet ``total_ch_num = mic_num + ref_num`` :
::
total_ch_num=1, mic_num=1, ref_num=0
total_ch_num=2, mic_num=1, ref_num=1
total_ch_num=2, mic_num=2, ref_num=0
total_ch_num=3, mic_num=2, ref_num=1
.. note ::
total_ch_num: the number of total channels, mic_num: the number of microphone channels, ref_num: the number of reference channels
At present, the AEC only supports one reference channel, so ref_num can only be 0 or 1.
- AFE single MIC
- Input audio data format: 16KHz, 16bit, two channels (one is mic data, another is reference data) ; If AEC is not required and the audio does not contain reference data. The input data can only have one channel of MIC data, and the ref_num need to be set 0.
- The input data frame length will vary according to the algorithm module configured by the user. Users can use ``afe->get_feed_chunksize()`` to get the number of sampling points (the data type of sampling points is int16).
The input data is arranged as follows:
.. figure:: ../../_static/AFE_mode_0.png
:alt: input data of single MIC
:height: 0.7in
- AFE dual MIC
- Input audio data format: 16KHz, 16bit, three channels (two are mic data, another is reference data) ; If AEC is not required and the audio does not contain reference data. The input data can only have two channels of MIC data, and the ref_num need to be set 0.
- The input data frame length will vary according to the algorithm module configured by the user. Users can use ``afe->get_feed_chunksize()`` to get the number of sampling points (the data type of sampling points is int16).
The input data is arranged as follows:
.. figure:: ../../_static/AFE_mode_other.png
:alt: input data of dual MIC
:height: 0.75in
.. note::
the converted data size is: ``afe->get_feed_chunksize * channel number * sizeof(short)``
AEC Introduction
~~~~~~~~~~~~~~~~
The AEC (Acoustic Echo Cancellation) algorithm supports maximum two-mic processing, which can effectively remove the echo in the mic input signal, and help with further speech recognition.
NS (Noise Suppression)
~~~~~~~~~~~~~~~~~~~~~~
NS algorithm supports single-channel processing and can suppress the non-human noise in single-channel audio, especially for steady noise.
BSS (Blind Source Separation)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
BSS algorithm supports dual-channel processing, which can well separate the target sound source from the rest of the interference sound, so as to extract the useful audio signal and ensure the quality of the subsequent speech.
MISO (Multi Input Single Output)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Miso algorithm supports dual channel input and single channel output. It is used to select a channel of audio output with high signal-to-noise ratio when there is no wakenet enable in the dual mic scene.
VAD (Voice Activity Detection)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
VAD algorithm supports real-time output of the voice activity state of the current frame.
AGC (Automatic Gain Control)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
AGC dynamically adjusts the amplitude of the output audio, and amplifies the output amplitude when a weak signal is input; When the input signal reaches a certain strength, the output amplitude will be compressed.
WakeNet or Bypass
~~~~~~~~~~~~~~~~~
Users can choose whether to detect wake words in AFE. When calling ``afe->disable_wakenet(afe_data)``, it will enter bypass mode, and the WakeNet will not run.
Output Audio
~~~~~~~~~~~~
The output audio of AFE is single-channel data. In the speech recognition scenario, AFE will output single-channel data with human voice while WakeNet is enabled. In the voice communication scenario, single channel data with higher signal-to-noise ratio will be output.
Quick Start
-----------
Define afe_handle
~~~~~~~~~~~~~~~~~~~~
``afe_handle`` is the function handle that the user calls the AFE interface. Therefore, the first step is to obtain ``afe_handle``.
- Speech recognition
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- Voice communication
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
Configure AFE
~~~~~~~~~~~~~
Get the configuration of AFE:
::
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
Users can adjust the switch of each algorithm module and its corresponding parameters in ``afe_config``:
::
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \
.pcm_config.ref_num = 1, \
}
- aec_init: Whether the AEC algorithm is enabled.
- se_init: Whether the BSS/NS algorithm is enabled.
- vad_init: Whether the VAD algorithm is enabled ( It can only be used in speech recognition scenarios ).
- wakenet_init: Whether the wake algorithm is enabled.
- voice_communication_init: Whether voice communication is enabled. It cannot be enabled with wakenet_init at the same time.
- voice_communication_agc_init: Whether the AGC is enabled in voice communication.
- voice_communication_agc_gain: The gain of AGC ( unit: dB )
- vad_mode: The VAD operating mode. The bigger, the more radical.
- wakenet_model_name: Its default value is NULL in macro ``AFE_CONFIG_DEFAULT()``. At first, you need to choose WakeNet model through ``idf.py menuconfig``. Then you need to assign a specific model name to this place before ``afe_handle->create_from_config``. The type of value is string. Please refer to `flash_model <../flash_model/README.md>`__
.. note::
In the example, we use the ``esp_srmodel_filter()`` to get wakenet_model_name. If you choose the multiple wakenet models coexist through menuconfig, this function will return a model name randomly.
- wakenet_mode: Wakenet mode. It indicate the number of wake-up channels according to the number of MIC channels.
- afe_mode: Espressif AFE supports two working modes: SR_MODE_LOW_COST, SR_MODE_HIGH_PERF. See the afe_sr_mode_t enumeration for details.
- SR_MODE_LOW_COST: The quantified version occupies less resources.
- SR_MODE_HIGH_PERF: The non-quantified version occupies more resources.
**ESP32 only supports SR_MODE_HIGH_PERF; And ESP32S3 supports both of the modes**
- afe_perferred_core: The internal BSS/NS/MISO algorithm of AFE will be running on which CPU core.
- afe_perferred_priority: The running priority of BSS/NS/MISO algorithm task.
- afe_ringbuf_size: Configuration of internal ringbuf size.
- memory_alloc_mode: Memory allocation mode. Three values can be configured:
- AFE_MEMORY_ALLOC_MORE_INTERNAL: More memory is allocated from internal ram.
- AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE: Part of memory is allocated from internal psram.
- AFE_MEMORY_ALLOC_MORE_PSRAM: Most of memory is allocated from external psram.
- agc_mode: Configuration for linear audio amplification which be used in speech recognition. It only takes effect when wakenet_init is enabled. Four values can be configured:
- AFE_MN_PEAK_AGC_MODE_1: Linearly amplify the audio which will fed to multinet. The peak value is -5 dB.
- AFE_MN_PEAK_AGC_MODE_2: Linearly amplify the audio which will fed to multinet. The peak value is -4 dB.
- AFE_MN_PEAK_AGC_MODE_3: Linearly amplify the audio which will fed to multinet. The peak value is -3 dB.
- AFE_MN_PEAK_NO_AGC: No amplification.
- pcm_config: Configure according to the audio that fed by ``afe->feed()``. This structure has three member variables to configure:
- total_ch_num: Total number of audio channels, total_ch_num = mic_num + ref_num.
- mic_num: The number of microphone channels. It only can be set to 1 or 2.
- ref_num: The number of reference channels. It only can be set to 0 or 1.
Create afe_data
~~~~~~~~~~~~~~~~~~
The user uses the ``afe_handle->create_from_config(&afe_config)`` function to obtain the data handle, which will be used internally in afe, and the parameters passed in are the configurations obtained in step 2 above.
::
/**
* @brief Function to initialze a AFE_SR instance
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
Feed Audio Data
~~~~~~~~~~~~~~~~~~
After initializing AFE, users need to input audio data into AFE by ``afe_handle->feed()`` function for processing.
The input audio size and layout format can refer to the step **Input Audio data**.
::
/**
* @brief Feed samples of an audio stream to the AFE_SR
*
* @Warning The input data should be arranged in the format of channel interleaving.
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
Get the number of audio channels:
``afe_handle->get_total_channel_num()`` function can provide the number of channels that need to be put into ``afe_handle->feed()`` function. Its return value is equal to ``pcm_config.mic_num + pcm_config.ref_num`` in AFE_CONFIG_DEFAULT()
::
/**
* @brief Get the total channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
Fetch Audio Data
~~~~~~~~~~~~~~~~~
Users can get the processed single-channel audio and related information by ``afe_handle->fetch()`` function.
The number of data sampling points of fetch (the data type of sampling point is int16) can be got by ``afe_handle->get_fetch_chunksize``.
::
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
*
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param afe The AFE_SR object to query
* @return The amount of samples to feed the fetch function
*/
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
The declaration of ``afe_handle->fetch()`` is as follows:
::
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
Its return value is a pointer of structure, and the structure is defined as follows:
::
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
int wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. It's unit is the number of samples.
int ret_value; // the return state of fetch function
void* reserved; // reserved for future use
} afe_fetch_result_t;
Usage Of WakeNet
~~~~~~~~~~~~~~~~~
When users need to perform other operations after wake-up, such as offline or online speech recognition. They can pause the operation of WakeNet to reduce the CPU resource consumption.
Users can call ``afe_handle->disable_wakenet(afe_data)`` to stop WakeNet, or call ``afe_handle->enable_wakenet(afe_data)`` to enable WakeNet.
In addition, ESP32S3 chip supports switching between wakenet words. (Note: ESP32 chip only supports one wake-up word and does not support switching). After AFE initialization, the ESP32S3 can switch wakenet word by ``afe_handle->set_wakenet()``. For example, ``afe_handle->set_wakenet(afe_data, "wn9_hilexin")`` can switch to the "Hi Lexin". How to configure multiple wakenet words, please refer to: `flash_model <../flash_model/README.md>`__
Usage Of AEC
~~~~~~~~~~~~~
The usage of AEC is similar to that of WakeNet. Users can disable or enable AEC according to requirements.
- Disable AEC
afe->disable_aec(afe_data);
- Enable AEC
afe->enable_aec(afe_data);

24
docs/en/conf.py Executable file
View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
#
# English Language RTD & Sphinx config file
#
# Uses ../conf_common.py for most non-language-specific settings.
# Importing conf_common adds all the non-language-specific
# parts to this conf module
import sys
import os
# Make the parent docs/ directory importable so conf_common can be found.
sys.path.insert(0, os.path.abspath('..'))
from conf_common import * # noqa: F401, F403 - need to make available everything from common
# General information about the project.
project = u'ESP-SR User Guide'
copyright = u'2016 - 2022, Espressif Systems (Shanghai) Co., Ltd.'
pdf_title = u'ESP-SR User Guide'
# Final PDF filename will contains target and version
# NOTE(review): pdf_file_prefix is already set in conf_common.py; this
# redefinition is redundant but harmless -- confirm which is authoritative.
pdf_file_prefix = u'esp-sr'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'en'

View File

@ -0,0 +1,180 @@
Model Loading Method
====================
:link_to_translation:`zh_CN:[中文]`
In esp-sr, both WakeNet and MultiNet will use a large amount of model data, and the model data is located in *ESP-SR_PATH/model/*. Currently esp-sr supports the following model loading methods:
ESP32:
- Load directly from Flash
ESP32S3:
- Load from Flash spiffs partition
- Load from external SDCard
So that on ESP32S3 you can:
- Greatly reduce the size of the user application APP BIN
- Supports the selection of up to two wake words
- Support online switching of Chinese and English Speech Command Recognition
- Convenient for users to perform OTA
- Supports reading and changing models from SD card, which is more convenient and can reduce the size of module Flash used in the project
- When the user is developing the code, when the modification does not involve the model, it can avoid flashing the model data every time, greatly reducing the flashing time and improving the development efficiency
Model Configuration Introduction
--------------------------------
Run *idf.py menuconfig* navigate to *ESP Speech Recognition*:
.. figure:: ../../_static/model-1.png
:alt: overview
overview
Model Data Path
~~~~~~~~~~~~~~~
This option is only available on ESP32S3. It indicates the storage location of the model data. It supports the choice of ``spiffs partition`` or ``SD Card``.
- *spiffs partition* means that the model data is stored in the Flash spiffs partition, and the model data will be loaded from the Flash spiffs partition
- ``SD Card`` means that the model data is stored in the SD card, and the model data will be loaded from the SD Card
Use AFE
~~~~~~~
This option needs to be turned on. Users do not need to modify it. Please keep the default configuration.
Use Wakenet
~~~~~~~~~~~
This option is turned on by default. When the user only uses ``AEC`` or ``BSS``, etc., and does not need to run ``WakeNet`` or ``MultiNet``, please turn off this option, which will reduce the size of the project firmware.
- Select wake words by menuconfig, ``ESP Speech Recognition -> Select wake words``. The model name of wake word in parentheses is used to initialize wakenet handle. |select wake wake|
- If you want to select multiple wake words, please select ``Load Multiple Wake Words`` ( **Note this option only supports ESP32S3**) |multi wake wake| Then you can select multiple wake words at the same time |image1|
For more details, please refer to `WakeNet <../wake_word_engine/README.md>`__ .
Use Multinet
~~~~~~~~~~~~
This option is turned on by default. When users only use WakeNet or other algorithm modules, please turn off this option, which will reduce the size of the project firmware in some cases.
ESP32 chip only supports Chinese Speech Commands Recognition.
ESP32S3 supports Chinese and English Speech Commands Recognition, and supports Chinese and English recognition model switching.
- Chinese Speech Commands Model
Chinese Speech Commands Recognition model selection.
ESP32 supports:
- None
- chinese single recognition (MultiNet2)
ESP32S3 supports:
- None
- chinese single recognition (MultiNet4.5)
- chinese single recognition (MultiNet4.5 quantized with 8-bit)
- English Speech Commands Model
English Speech Commands Recognition model selection.
This option does not support ESP32.
ESP32S3 Supports:
- None
- english recognition (MultiNet5 quantized with 8-bit, depends on WakeNet8)
- Add Chinese speech commands
The user needs to add Chinese Speech Command words to this item when ``Chinese Speech Commands Model`` is not ``None``.
- Add English speech commands
The user needs to add English Speech Command words to this item when ``English Speech Commands Model`` is not ``None``.
For more details, please refer to `MultiNet <../speech_command_recognition/README.md>`__ .
How To Use
----------
Here is an introduction to the code implementation of model data loading in the project. If you want to get more details, please refer to the esp-skainet examples.
ESP32
~~~~~
| When the user uses ESP32, since it only supports loading the model data directly from the Flash, the model data in the code will automatically read the required data from the Flash according to the address.
| Now The ESP32S3 API is compatible with ESP32. You can refer to the ESP32S3 method to load and initialize the model.
ESP32S3
~~~~~~~
#. Write a partition table:
::
model, data, spiffs, , SIZE,
Among them, ``SIZE`` can refer to the recommended size when the user uses ``idf.py build`` to compile, for example:
::
Recommended model partition size: 500K
After completing the above configuration, the project will automatically generate ``model.bin`` after the project is compiled, and flash it to the spiffs partition.
#. Initialize the spiffs partition User can use ``esp_srmodel_init()`` API to initialize spiffs and return all loaded models.
- base_path: The model storage ``base_path`` is ``srmodel`` and cannot be changed
- partition_label: The partition label of the model is ``model``, which needs to be consistent with the ``Name`` in the above partition table
**Note: After the user changes the model, be sure to run ``idf.py clean`` before compiling again.**
.. _esp32s3-1:
ESP32S3
-------
::
//
// step1: initialize spiffs and return models in spiffs
//
srmodel_list_t *models = esp_srmodel_init("model");
//
// step2: select the specific model by keywords
//
char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); // select wakenet model
char *nm_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); // select multinet model
char *alexa_wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "alexa"); // select wakenet with "alexa" wake word.
char *en_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH); // select english multinet model
char *cn_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE); // select chinese multinet model
// It also works if you use the model name directly in your code.
char *my_wn_name = "wn9_hilexin";
// we recommend you to check that it is loaded correctly
if (!esp_srmodel_exists(models, my_wn_name))
    printf("%s can not be loaded correctly\n", my_wn_name);
//
// step3: initialize model
//
esp_wn_iface_t *wakenet = esp_wn_handle_from_name(wn_name);
model_iface_data_t *wn_model_data = wakenet->create(wn_name, DET_MODE_2CH_90);
esp_mn_iface_t *multinet = esp_mn_handle_from_name(mn_name);
model_iface_data_t *mn_model_data = multinet->create(mn_name, 6000);
.. |select wake wake| image:: ../../_static/wn_menu1.png
.. |multi wake wake| image:: ../../_static/wn_menu2.png
.. |image1| image:: ../../_static/wn_menu3.png

29
docs/en/index.rst Normal file
View File

@ -0,0 +1,29 @@
:link_to_translation:`zh_CN:[中文]`
This document introduces Espressif's `ESP-SR <https://github.com/espressif/esp-sr>`__ AI voice solution based on the ESP32 series chips. From front-end audio processing to voice command word recognition, and from hardware design suggestions to performance testing methods, it is a comprehensive introduction to Espressif's systematic work on AI speech, and provides a strong reference for users building AIoT applications on Espressif's ESP32 series chips and development boards.
Espressif's AFE algorithm has passed the Software Audio Front-End certification for Amazon Alexa built-in devices. The built-in wake-up module in the AFE algorithm can realize the local voice wake-up function and supports wake word customization. Espressif's voice command word recognition model can support up to 200 English and Chinese command words, and the command words can be modified at runtime, bringing great flexibility to the application.
Based on years of hardware design and development experience, Espressif can provide a voice development board review service for customers, and will be happy to test and tune the development board for customers to show the optimal performance of the algorithm. Customers can also conduct an in-depth evaluation of the development board and the whole product according to the test methods and self-test results provided by Espressif.
.. only:: html
**This document only contains the ESP-SR usage** for the chip. For other chips, please select your target chip from the drop-down menu at the top left of the page.
.. only:: latex
**This document contains ESP-SR usage** for the chip only.
.. toctree::
:hidden:
AFE acoustic front-end algorithm <audio_front_end/README>
Wake word model <wake_word_engine/README>
Customized wake words <wake_word_engine/ESP_Wake_Words_Customization>
Speech commands <speech_command_recognition/README>
Model loading method <flash_model/README>
Microphone Design Guidelines <audio_front_end/Espressif_Microphone_Design_Guidelines>
Test Reports <test_report/README>
Performance Testing <performance_test/README>

View File

@ -0,0 +1,164 @@
Performance Test
================
:link_to_translation:`zh_CN:[中文]`
AFE
---
Resource Occupancy(ESP32)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------+-----------------+-----------------+-----------------+
| algorithm Type | RAM | Average cpu | Frame Length |
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(HIGH_PERF) | 114 KB | 11% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 73 KB | | |
+-----------------+-----------------+-----------------+-----------------+
Resource Occupancy(ESP32S3)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-----------------+-----------------+-----------------+-----------------+
| algorithm Type | RAM | Average cpu | Frame Length |
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(LOW_COST) | 152.3 KB | 8% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| AEC(HIGH_PERF) | 166 KB | 11% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(LOW_COST) | 198.7 KB | 6% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(HIGH_PERF) | 215.5 KB | 7% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| MISO | 56 KB | 8% | 16 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
WakeNet
-------
.. _resource-occupancyesp32-1:
Resource Occupancy(ESP32)
~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Parameter | RAM | Average | Frame |
| | Num | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| Quantised | 41 K | 15 KB | 5.5 ms | 30 ms |
| WakeNet5 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| Quantised | 165 K | 20 KB | 10.5 ms | 30 ms |
| WakeNet5X2 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| Quantised | 371 K | 24 KB | 18 ms | 30 ms |
| WakeNet5X3 | | | | |
+-------------+-------------+-------------+-------------+-------------+
.. _resource-occupancyesp32s3-1:
Resource Occupancy(ESP32S3)
~~~~~~~~~~~~~~~~~~~~~~~~~~~
+----------------+-------+---------+----------------+--------------+
| Model Type | RAM | PSRAM | Average | Frame Length |
| | | | Running Time | |
| | | | per Frame | |
+================+=======+=========+================+==============+
| Quantised | 50 KB | 1640 KB | 10.0 ms | 32 ms |
| WakeNet8 @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 16 KB | 324 KB | 3.0 ms | 32 ms |
| WakeNet9 @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 20 KB | 347 KB | 4.3 ms | 32 ms |
| WakeNet9 @ 3 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
Performance
~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Distance | Quiet | Stationary | Speech | AEC |
| | | Noise (SNR | Noise (SNR | I |
| | | = 4 dB) | = 4 dB) | nterruption |
| | | | | (-10 dB) |
+=============+=============+=============+=============+=============+
| 1 m | 98% | 96% | 94% | 96% |
+-------------+-------------+-------------+-------------+-------------+
| 3 m | 98% | 96% | 94% | 94% |
+-------------+-------------+-------------+-------------+-------------+
False triggering rate: 1 time in 12 hours
**Note**: We use the ESP32-S3-Korvo V4.0 development board and the WakeNet9(Alexa) model in our test.
MultiNet
--------
.. _resource-occupancyesp32-2:
Resource Occupancy(ESP32)
~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Internal | PSRAM | Average | Frame |
| | RAM | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| MultiNet 2 | 13.3 KB | 9KB | 38 ms | 30 ms |
+-------------+-------------+-------------+-------------+-------------+
.. _resource-occupancyesp32s3-2:
Resource Occupancy(ESP32S3)
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Internal | PSRAM | Average | Frame |
| | RAM | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| MultiNet 4 | 16.8KB | 1866 KB | 18 ms | 32 ms |
+-------------+-------------+-------------+-------------+-------------+
| MultiNet 4 | 10.5 KB | 1009 KB | 11 ms | 32 ms |
| Q8 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| MultiNet 5 | 16 KB | 2310 KB | 12 ms | 32 ms |
| Q8 | | | | |
+-------------+-------------+-------------+-------------+-------------+
Performance with AFE
~~~~~~~~~~~~~~~~~~~~
+-----------+-----------+-----------+-----------+-----------+
| Model | Distance | Quiet | S | Speech |
| Type | | | tationary | Noise |
| | | | Noise | (SNR = 4 |
| | | | (SNR = 4 | dB) |
| | | | dB) | |
+===========+===========+===========+===========+===========+
| MultiNet | 3 m | 98% | 93% | 92% |
| 4 | | | | |
+-----------+-----------+-----------+-----------+-----------+
| MultiNet | 3 m | 94% | 92% | 91% |
| 4 Q8 | | | | |
+-----------+-----------+-----------+-----------+-----------+

View File

@ -0,0 +1,242 @@
MultiNet Introduction
=====================
:link_to_translation:`zh_CN:[中文]`
MultiNet is a lightweight model designed to realize speech commands
recognition offline on ESP32 series. Now, up to 200 speech commands,
including customized commands, are supported.
* Support Chinese and English speech commands recognition (esp32s3 is required for English speech commands recognition)
* Support user-defined commands
* Support adding / deleting / modifying commands during operation
* Up to 200 commands are supported
* It supports single recognition and continuous recognition
* Lightweight and low resource consumption
* Low delay, within 500ms
* Support online Chinese and English model switching (esp32s3 only)
* The model is partitioned separately to support users to apply OTA
Overview
-----------
The MultiNet input is the audio processed by the audio-front-end
algorithm (AFE), with the format of 16KHz, 16bit and mono. By
recognizing the audio, you can correspond to the corresponding Chinese
characters or English words.
The following table shows the model support of Espressif SoCs:
+---------+-----------+-------------+---------------+-------------+
| Chip | ESP32 | ESP32S3 |
+=========+===========+=============+===============+=============+
| Model | MultiNet2 | MultiNet4.5 | MultiNet4.5Q8 | MultiNet5Q8 |
+---------+-----------+-------------+---------------+-------------+
| Chinese | √ | √ | √ | √ |
+---------+-----------+-------------+---------------+-------------+
| English | | | | √ |
+---------+-----------+-------------+---------------+-------------+
.. note::
Note: a model name ending with Q8 represents the 8-bit quantized version of the model, which means it is more lightweight.
Commands Recognition Process
-------------------------------
Please see the flow diagram below:
.. figure:: ../../_static/multinet_workflow.png
:alt: speech_command-recognition-system
speech_command-recognition-system
User Guide
-------------
Requirements of speech commands
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- The recommended length of Chinese is generally 4-6 Chinese characters. Too short leads to high false recognition rate and too long is inconvenient for users to remember
- The recommended length of English is generally 4-6 words
- Mixed Chinese and English is not supported in command words
- Currently, up to 200 command words are supported
- The command word cannot contain Arabic numerals and special characters
- Avoid common command words like "hello"
- The greater the pronunciation difference of each Chinese character / word in the command word, the better the performance
Speech commands customization method
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
* Support a variety of speech commands customization methods
* Support dynamic addition / deletion / modification of speech commands
Format of Speech commands
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Speech commands string need to meet specific formats, as follows:
- Chinese
Chinese speech commands need to use Chinese Pinyin, and there should be a space between the Pinyin spelling of each word.
In addition, we also provide corresponding tools for users to convert Chinese characters into pinyin. See details:
- English
English speech commands need to be represented by specific phonetic symbols. The phonetic symbols of each word are separated by spaces, such as "turn on the light", which needs to be written as "TkN nN jc LiT".
**We provide specific conversion rules and tools. For details, please refer to the English G2P** `tool <../../tool/multinet_g2p.py>`__.
Set speech commands offline
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Multinet supports flexible speech commands setting methods. No matter which way users set speech commands (code / network / file), they only need to call the corresponding API.
Here we provide two methods of adding speech commands:
- Use ``menuconfig``
Users can refer to the example in ESP-Skainet, users can define their own speech commands by ``idf.py menuconfig -> ESP Speech Recognition-> Add Chinese speech commands/Add English speech commands``.
.. figure:: ../../_static/menuconfig_add_speech_commands.png
:alt: menuconfig_add_speech_commands
menuconfig_add_speech_commands
Please note that a single ``Command ID`` can support multiple phrases. For example, "da kai kong tiao" and "kai kong tiao" have the same meaning, you can write them in the entry corresponding to the same command ID, and separate the adjacent entries with the English character "," without spaces before and after ",".
Then call the following API:
::
/**
* @brief Update the speech commands of MultiNet by menuconfig
*
* @param multinet The multinet handle
*
* @param model_data The model object to query
*
* @param language The language of MultiNet
*
* @return
* - ESP_OK Success
* - ESP_ERR_INVALID_STATE Fail
*/
esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
- Add speech commands in the code
Users can refer to example in ESP-Skainet for this method of adding speech commands.
In this method, users directly set the speech command words in the code and transmits them to multinet. In the actual development and products, the user can transmit the required speech commands through various possible ways such as network / UART / SPI and change the speech commands.
Set speech commands online
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
MultiNet supports online dynamic addition / deletion / modification of speech commands during operation, without changing models or adjusting parameters. For details, please refer to the example in ESP-Skainet.
Please refer to
`esp_mn_speech_commands <../../src/esp_mn_speech_commands.c>`__ for
details of APIs:
Run speech commands recognition
----------------------------------
Speech commands recognition needs to be run together with the audio front-end (AFE) in esp-sr (WakeNet needs to be enabled in AFE). For the use of AFE, please refer to the document:
`AFE Introduction and Use <../audio_front_end/README.rst>`__
MultiNet Initialization
~~~~~~~~~~~~~~~~~~~~~~~~~~~
- Initialize multinet model
- Set speech commands
Please refer to the *Speech commands customization method* section above.
Run MultiNet
~~~~~~~~~~~~~
When users use AFE and enable WakeNet, they can then use MultiNet. The
following requirements apply:
* The frame length of MultiNet is equal to the AFE fetch frame length
* The audio format supported is 16KHz, 16bit, mono. The data obtained by AFE fetch is also in this format
- Get the frame length that needs to be passed into MultiNet
::
int mu_chunksize = multinet->get_samp_chunksize(model_data);
- MultiNet detect
We send the data from AFE fetch to the following API:
::
esp_mn_state_t mn_state = multinet->detect(model_data, buff);
The length of ``buff`` is ``mu_chunksize * sizeof(int16_t)``.
The detect result of MultiNet
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Speech commands recognition supports two basic modes:
* Single recognition
* Continuous recognition
Speech command recognition must be used with WakeNet. After wake-up, MultiNet detection can be run.
When the MultiNet is running, it will return the recognition status of the current frame in real time ``mn_state``, which is currently divided into the following identification states:
- ESP_MN_STATE_DETECTING
This status indicates that the MultiNet is detecting but target
speech command word has not been recognized.
- ESP_MN_STATE_DETECTED
This status indicates that the target speech command has been recognized. At this time, the user can call ``get_results`` interface obtains the identification results.
::
esp_mn_results_t *mn_result = multinet->get_results(model_data);
The information identifying the result is stored in the return value of the ``get_result`` API, the data type of the return value is as follows:
::
typedef struct{
esp_mn_state_t state;
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
} esp_mn_results_t;
- ``state`` is the recognition status of the current frame
- ``num`` means the number of recognized commands, ``num`` <= 5, up to 5 possible results are returned
- ``phrase_id`` means the Phrase ID of speech commands
- ``prob`` means the recognition probability of the recognized entries, which is arranged from large to small
Users can use ``phrase_id[0]`` and ``prob[0]`` to get the recognition result with the highest probability.
- ESP_MN_STATE_TIMEOUT
This status means that the speech commands have not been detected for a long time; MultiNet will exit automatically and wait for the next wake-up.
* Therefore:
* Exit the speech recognition when the return status is ``ESP_MN_STATE_DETECTED``, it is single recognition mode;
* Exit the speech recognition when the return status is ``ESP_MN_STATE_TIMEOUT``, it is continuous recognition mode;
Other configurations
-----------------------
Threshold setting
~~~~~~~~~~~~~~~~~~~~~
::
This function is still under development.

View File

@ -0,0 +1,4 @@
Test Methods and Test Reports
==============================
:link_to_translation:`zh_CN:[中文]`

View File

@ -0,0 +1,81 @@
Espressif Speech Wake-up Solution Customization Process
========================================================
:link_to_translation:`zh_CN:[中文]`
Speech Wake Word Customization Process
---------------------------------------
Espressif provides users with the offline wake word customization service, which allows users to use both publicly available wake words (such as "Hi Lexin", "Alexa", and "hi,ESP") and customized wake words.
#. If you want to use publicly available wake words for commercial use
- Please check the wake words provided in `esp-sr <https://github.com/espressif/esp-sr>`__;
- We will continue to provide more and more wake words that are free for commercial use.
#. If you want to use custom wake words, we can also provide the offline
wake word customization service.
- If you provide a training corpus
- It must consist of at least 20,000 qualified corpus entries(see the section below for detailed requirements);
- It will take two to three weeks for Espressif to train and optimize the corpus after the hardware design meets our requirement;
- It will be delivered in a static library of wake word;
- Espressif will charge training fees based on the scale of your production.
- Otherwise
- Espressif will collect and provide all the training corpus;
- Espressif will deliver a static library file of successfully trained wake word to you, but won't share the corpus;
- It will take around three weeks to collect and train the corpus;
- Espressif will charge training fees (corpus collecting fees included) based on the scale of your production.
- The above time is subject to change depending on the project.
- Espressif will only charge a one-time customization fee depending on the number of wake words you customize and the scale of your production, and will not charge license fees for the quantity and time of use. Please email us at `sales@espressif.com <sales@espressif.com>`__ for details of the fee.
#. If you want to use offline command words
- Please set them by yourself referring to the `esp-sr <https://github.com/espressif/esp-sr/tree/c5896943ea278195968c93c8b3466c720e641ebc/speech_command_recognition>`__ algorithm. They do not need additional customization.
- Similar to speech wake words, the effect of command words is also related to hardware designs, so please refer to *Espressif MIC Design Guidelines*.
Requirements on Corpus
--------------------------
As mentioned above, you can provide your own training corpus for Espressif. Below are the requirements.
#. Audio file format
- Sample rate: 16 kHz
- Encoding: 16-bit signed int
- Channel: mono
- Format: WAV
#. Sampling environment
- Room with an ambient noise lower than 30 dB and reverberation less than 0.3 s, or a professional audio room (recommended).
- Recording device: high-fidelity microphone.
- The whole product is strongly recommended.
- The development board of your product also works when there is no cavity structure.
- Record in 16 kHz, and don't use **resampling**.
- At the recording site, pay attention to the impact of reverberation interference in a closed environment.
- Collect samples with multiple recording devices at the same time (recommended).
- For example, position the devices at 1 m and 3 m away.
- This way, more samples can be collected within the same amount of time and with the same number of participants.
#. Sample distribution
- Sample size: 500. Males and females should be close to 1:1.
- The number of children under 12 years old involved varies from product to product, but the percentage should be no less than 15%.
- If there are requirements for certain languages or dialects, special corpus samples need to be provided.
- It is recommended to name the samples according to the age, gender, and quantity of the collected samples, such as HiLeXin_male_B_014.wav, and ABCD represents different age groups.
Hardware Design Guidelines
---------------------------
#. Please refer to *Espressif MIC Design Guidelines*.

View File

@ -0,0 +1,102 @@
wakeNet
========
:link_to_translation:`zh_CN:[中文]`
wakeNet, which is a wake word engine built upon neural networks, is specially designed for low-power embedded MCUs. Now, the wakeNet model supports up to 5 wake words.
Overview
--------
Please see the flow diagram of wakeNet below:
.. figure:: ../../_static/wakenet_workflow.png
:alt: overview
.. raw:: html
<center>
.. raw:: html
</center>
- Speech Feature:
  We use the `MFCC <https://en.wikipedia.org/wiki/Mel-frequency_cepstrum>`__ method to extract speech spectrum features from the input audio clip (16 KHz sample rate, signed 16-bit encoding, mono). The window width and step size of each frame are both 30 ms.
- Neural Network:
Now, the neural network structure has been updated to the ninth edition, among which:
- wakeNet1,wakeNet2,wakeNet3,wakeNet4,wakeNet6,wakeNet7 had been out of use.
- wakeNet5 only support ESP32 chip.
- wakeNet8,wakeNet9 only support ESP32S3 chip, which are built upon the `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ structure.
.. note::
   The network structure of wakeNet5, wakeNet5X2 and wakeNet5X3 is the same, but wakeNet5X2 and wakeNet5X3 have more parameters than wakeNet5. Please refer to `Performance Test <#performance-test>`__ for details.
- Keyword Triggering Method:
For continuous audio stream, we calculate the average recognition results (M) for several frames and generate a smoothing prediction result, to improve the accuracy of keyword triggering. Only when the M value is larger than the set threshold, a triggering command is sent.
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Chip | ESP32 | ESP32S3 |
+=================+===========+=============+=============+===========+===========+===========+===========+
| model | WakeNet 5 | WakeNet 8 | WakeNet 9 |
| +-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| | WakeNet 5 | WakeNet 5X2 | WakeNet 5X3 | Q16 | Q8 | Q16 | Q8 |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Hi,Lexin | √ | √ | √ | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| nihaoxiaozhi | √ | | √ | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| nihaoxiaoxin | | | √ | | | | |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| xiaoaitongxue | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Alexa | | | | √ | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Hi,ESP | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Customized word | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
Use wakeNet
-----------
- How to select the wakeNet model
Please refer to `flash model <../flash_model/README.rst>`__.
- How to run wakeNet
wakeNet is currently included in the `AFE <../audio_front_end/README.rst>`__, which is running by default, and returns the detect results through the AFE fetch interface.
If users do not want to initialize WakeNet, please use:
::
afe_config.wakenet_init = false;
If users want to close/open WakeNet temporarily, please use:
::
afe_handle->disable_wakenet(afe_data)
afe_handle->enable_wakenet(afe_data)
Performance Test
----------------
Please refer to `Performance Test <../performance_test/README.rst>`__.
Wake Word Customization
-----------------------
For details on how to customize your wake words, please see `Espressif Speech Wake Word Customization Process <ESP_Wake_Words_Customization.rst>`__.

View File

@ -1,162 +0,0 @@
# Model loading method[[中文]](./README_CN.md)
In esp-sr, both WakeNet and MultiNet will use a large amount of model data, and the model data is located in `ESP-SR_PATH/model/`.
Currently esp-sr supports the following model loading methods:
ESP32:
- Load directly from Flash
ESP32S3:
- Load from Flash spiffs partition
- Load from external SDCard
So that on ESP32S3 you can:
- Greatly reduce the size of the user application APP BIN
- Supports the selection of up to two wake words
- Support online switching of Chinese and English Speech Command Recognition
- Convenient for users to perform OTA
- Supports reading and changing models from SD card, which is more convenient and can reduce the size of module Flash used in the project
- When the user is developing the code, when the modification does not involve the model, it can avoid flashing the model data every time, greatly reducing the flashing time and improving the development efficiency
## 1. Model configuration introduction
Run `idf.py menuconfig` navigate to `ESP Speech Recognition`:
![overview](../img/model-1.png)
### 1.1 model data path
This option is only available on ESP32S3. It indicates the storage location of the model data. It supports the choice of `spiffs partition` or `SD Card`.
- `spiffs partition` means that the model data is stored in the Flash spiffs partition, and the model data will be loaded from the Flash spiffs partition
- `SD Card` means that the model data is stored in the SD card, and the model data will be loaded from the SD Card
### 1.2 use afe
This option needs to be turned on. Users do not need to modify it. Please keep the default configuration.
### 1.3 use wakenet
This option is turned on by default. When the user only uses `AEC` or `BSS`, etc., and does not need to run `WakeNet` or `MultiNet`, please turn off this option, which will reduce the size of the project firmware.
- Select wake words by menuconfig, `ESP Speech Recognition -> Select wake words`. The model name of wake word in parentheses is used to initialize wakenet handle.
![select wake wake](../img/wn_menu1.png)
- If you want to select multiple wake words, please select `Load Multiple Wake Words` ( **Note this option only supports ESP32S3**)
![multi wake wake](../img/wn_menu2.png)
Then you can select multiple wake words at the same time
![multi wake wake](../img/wn_menu3.png)
For more details, please refer to [WakeNet](../wake_word_engine/README.md) .
### 1.4 use multinet
This option is turned on by default. When users only use WakeNet or other algorithm modules, please turn off this option, which will reduce the size of the project firmware in some cases.
ESP32 chip only supports Chinese Speech Commands Recognition.
ESP32S3 supports Chinese and English Speech Commands Recognition, and supports Chinese and English recognition model switching.
- Chinese Speech Commands Model
Chinese Speech Commands Recognition model selection.
ESP32 supports:
- None
- chinese single recognition (MultiNet2)
ESP32S3 支持:
- None
- chinese single recognition (MultiNet4.5)
- chinese single recognition (MultiNet4.5 quantized with 8-bit)
- English Speech Commands Model
English Speech Commands Recognition model selection.
This option does not support ESP32.
ESP32S3 Supports
- None
- english recognition (MultiNet5 quantized with 8-bit, depends on WakeNet8)
- Add Chinese speech commands
The user needs to add Chinese Speech Command words to this item when `Chinese Speech Commands Model` is not `None`.
- Add English speech commands
The user needs to add English Speech Command words to this item when `Chinese Speech Commands Model` is not `None`.
For more details, please refer to [MultiNet](../speech_command_recognition/README.md) .
## 2. How to use
Here is an introduction to the code implementation of model data loading in the project. If you want get more detailes, please refer to esp-skainet examples.
### 2.1.1 ESP32
When the user uses ESP32, since it only supports loading the model data directly from the Flash, the model data in the code will automatically read the required data from the Flash according to the address.
Now The ESP32S3 API is compatible with ESP32. You can refer to the ESP32S3 method to load and initialize the model.
### 2.1.2 ESP32S3
- Step1: Write a partition table:
```
model, data, spiffs, , SIZE,
```
Among them, `SIZE` can refer to the recommended size when the user uses 'idf.py build' to compile, for example:
```
Recommended model partition size: 500K
```
After completing the above configuration, the project will automatically generate `model.bin` after the project is compiled, and flash it to the spiffs partition.
- Step2: Initialize the spiffs partition
User can use `esp_srmodel_init()` API to initialize spiffs and return all loaded models.
- base_path: The model storage `base_path` is `srmodel` and cannot be changed
- partition_label: The partition label of the model is `model`, which needs to be consistent with the `Name` in the above partition table
**<font color=red>Note: After the user changes the model, be sure to run `idf.py clean` before compiling again.</font>**
## 2.2 ESP32S3
```
//
// step1: initialize spiffs and return models in spiffs
//
srmodel_list_t *models = esp_srmodel_init("model");
//
// step2: select the specific model by keywords
//
char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); // select wakenet model
char *nm_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); // select multinet model
char *alexa_wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "alexa"); // select wakenet with "alexa" wake word.
char *en_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH); // select english multinet model
char *cn_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE); // select chinese multinet model
// It also works if you use the model name directly in your code.
char *my_wn_name = "wn9_hilexin";
// we recommend you to check that it is loaded correctly
if (!esp_srmodel_exists(models, my_wn_name))
    printf("%s can not be loaded correctly\n", my_wn_name);
//
// step3: initialize model
//
esp_wn_iface_t *wakenet = esp_wn_handle_from_name(wn_name);
model_iface_data_t *wn_model_data = wakenet->create(wn_name, DET_MODE_2CH_90);
esp_mn_iface_t *multinet = esp_mn_handle_from_name(mn_name);
model_iface_data_t *mn_model_data = multinet->create(mn_name, 6000);
```

View File

@ -1,195 +0,0 @@
# 模型加载方式[[English]](./README.md)
在 esp-sr 中WakeNet 和 MultiNet 均会使用到大量的模型数据,模型数据位于 `ESP-SR_PATH/model/` 中。
目前 esp-sr 支持以下模型加载方式:
ESP32
- 从 Flash 中直接加载
ESP32S3
- 从 Flash spiffs 分区加载
- 从外部 SDCard 加载
从而在 ESP32S3 上可以:
- 大大减小用户应用 APP BIN 的大小
- 支持选择最多两个唤醒词
- 支持中文和英文命令词识别在线切换
- 方便用户进行 OTA
- 支持从 SD 卡读取和更换模型,更加便捷且可以缩减项目使用的模组 Flash 大小
- 当用户进行开发时,当修改不涉及模型时,可以避免每次烧录模型数据,大大缩减烧录时间,提高开发效率
## 1. 模型配置介绍
运行 `idf.py menuconfig` 进入 `ESP Speech Recognition`:
![overview](../img/model-1.png)
### 1.1 model data path
该选项只在 ESP32S3 上可用,表示模型数据的存储位置,支持选择 `spiffs partition``SD Card`
- `spiffs partition` 表示模型数据存储在 Flash spiffs 分区中,模型数据将会从 Flash spiffs 分区中加载
- `SD Card` 表示模型数据存储在 SD 卡中,模型数据将会从 SD Card 中加载
### 1.2 use afe
该选项需要打开,用户无须修改,请保持默认配置。
### 1.3 use wakenet
此选项默认打开,当用户只使用 AEC 或者 BSS 等,无须运行 WakeNet 或 MultiNet 时,请关闭此选项,将会减小工程固件的大小。
- 根据menuconfig列表选择唤醒词模型`ESP Speech Recognition -> Select wake words`. 括号中为唤醒词模型的名字你需要在代码用名字切换初始化wakenet.
![select wake wake](../img/wn_menu1.png)
- 如果想加载多个唤醒词,以便在代码中进行唤醒词的切换,首先选择'Load Multiple Wake Words'
![multi wake wake](../img/wn_menu2.png)
然后按照列表选择多个唤醒词:
![multi wake wake](../img/wn_menu3.png)
**注:多唤醒词选项只支持 ESP32S3具体根据客户硬件flash容量选择合适数量的唤醒词。**
更多细节请参考 [WakeNet](../wake_word_engine/README.md) 。
### 1.4 use multinet
此选项默认打开。当用户只使用 WakeNet 或者其他算法模块时,请关闭此选项,将会在一些情况下减小工程固件的大小。
ESP32 芯片只支持中文命令词识别。ESP32S3 支持中文和英文命令词识别,且支持中英文识别模型切换。
- Chinese Speech Commands Model
中文命令词识别模型选择。
ESP32 支持:
- None
- chinese single recognition (MultiNet2)
ESP32S3 支持:
- None
- chinese single recognition (MultiNet4.5)
- chinese single recognition (MultiNet4.5 quantized with 8-bit)
- English Speech Commands Model
英文命令词识别模型选择。
该选项不支持 ESP32。
ESP32S3 支持:
- None
- english recognition (MultiNet5 quantized with 8-bit, depends on WakeNet8)
- Add Chinese speech commands
当用户在 `Chinese Speech Commands Model` 中选择非 `None` 时,需要在该项处添加中文命令词。
- Add English speech commands
当用户在 `English Speech Commands Model` 中选择非 `None` 时,需要在该项处添加英文命令词。
用户按照需求自定义添加命令词,具体请参考 [MultiNet](../speech_command_recognition/README.md) 。
## 2. 模型使用
当用户完成以上的配置选择后,应用层请参考 esp-skainet 进行初始化和使用。这里介绍一下模型数据加载在用户工程中的代码实现。
也可以参考代码 [model_path.c](../../src/model_path.c)
### 2.1 使用 ESP32
当用户使用 ESP32 时,由于只支持从 Flash 中直接加载模型数据,因此代码中模型数据会自动按照地址从 Flash 中读取所需数据。
为了和ESP32S3进行兼容代码中模型的初始化方法是和ESP32S3相同的可参考下面ESP32S3的模型加载API
### 2.2 使用 ESP32S3
#### 2.2.1 模型数据存储在 SPIFFS
- 编写分区表:
```
model, data, spiffs, , SIZE,
```
其中 SIZE 可以参考在用户使用 'idf.py build' 编译时的推荐大小,例如:
```
Recommended model partition size: 500K
```
- 初始化 spiffs 分区
**调用提供的 API**:用户可以直接调用 `esp_srmodel_init()` API 来初始化 spiffs并返回spiffs中的模型。
- base_path模型的存储 `base_path``srmodel`,不可更改
- partition_label模型的分区 label 为 `model`,需要和 上述分区表中的 `Name` 保持一致
完成上述配置后,模型会在工程编译完成后自动生成 `model.bin`,并在用户调用`idf.py flash`时烧写到 spiffs 分区。
#### 2.2.2 模型数据存储在 SD Card
当用户配置 #1.2 模型数据存储位置是 `SD Card` 时,用户需要:
- 手动移动模型数据
将模型移动到 SDCard 中,用户完成以上配置后,可以先进行编译,编译完成后将 `ESP-SR_PATH/model/target/` 目录下的文件拷贝至 SD 卡的根目录。
- 自定义路径
如果用户想将模型放置于指定文件夹,可以自己修改 `get_model_base_path()` 函数,位于 `ESP-SR_PATH/model/model_path.c`
比如,指定文件夹为 SD 卡目录中的 `espmodel`, 则可以修改该函数为:
```
char *get_model_base_path(void)
{
#if defined CONFIG_MODEL_IN_SDCARD
return "sdcard/espmodel";
#elif defined CONFIG_MODEL_IN_SPIFFS
return "srmodel";
#else
return NULL;
#endif
}
```
- 初始化 SD 卡
用户需要初始化 SD 卡,来使系统能够记载 SD 卡,如果用户使用 esp-skainet可以直接调用 `esp_sdcard_init("/sdcard", num);` 来初始化其支持开发板的 SD 卡。否则,需要自己编写。
完成以上操作后,便可以进行工程的烧录。
#### 2.2.3 代码中模型初始化与使用
```
//
// step1: initialize spiffs and return models in spiffs
//
srmodel_list_t *models = esp_srmodel_init();
//
// step2: select the specific model by keywords
//
char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); // select wakenet model
char *nm_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); // select multinet model
char *alexa_wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "alexa"); // select wakenet with "alexa" wake word.
char *en_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH); // select english multinet model
char *cn_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE); // select chinese multinet model
// It also works if you use the model name directly in your code.
char *my_wn_name = "wn9_hilexin";
// we recommend you to check that it is loaded correctly
if (!esp_srmodel_exists(models, my_wn_name))
    printf("%s can not be loaded correctly\n", my_wn_name);
//
// step3: initialize model
//
esp_wn_iface_t *wakenet = esp_wn_handle_from_name(wn_name);
model_iface_data_t *wn_model_data = wakenet->create(wn_name, DET_MODE_2CH_90);
esp_mn_iface_t *multinet = esp_mn_handle_from_name(mn_name);
model_iface_data_t *mn_model_data = multinet->create(mn_name, 6000);
```

216
docs/generate_chart.py Normal file
View File

@ -0,0 +1,216 @@
#!/usr/bin/env python
# Copyright 2020 Espressif Systems (Shanghai) PTE LTD
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse
import datetime as dt
import json
import matplotlib.dates
import matplotlib.patches as mpatches
import matplotlib.pyplot as plt
import numpy as np
import requests
from dateutil import parser
from dateutil.relativedelta import relativedelta
from matplotlib.dates import MONTHLY, DateFormatter, RRuleLocator, rrulewrapper
class Version(object):
    """A single release line together with its support-period milestone dates.

    Dates are parsed once in the constructor and cached both as datetimes and
    as matplotlib date numbers (the latter are consumed directly by barh()).
    """

    def __init__(self, version_name, explicit_start_date, explicit_end_date, explicit_end_service_date=None):
        # Public label shown on the chart's y-axis.
        self.version_name = version_name
        self._start_date = parser.parse(explicit_start_date)
        self._end_of_life_date = parser.parse(explicit_end_date)
        # End of the "service period": taken verbatim when supplied, otherwise
        # derived as 12 months after the start date.
        if explicit_end_service_date is not None:
            self._end_service_date = parser.parse(explicit_end_service_date)
        else:
            self._end_service_date = self.compute_end_service_date()
        # Pre-converted matplotlib date numbers for plotting.
        self.start_date_matplotlib_format = matplotlib.dates.date2num(self._start_date)
        self.end_of_life_date_matplotlib_format = matplotlib.dates.date2num(self._end_of_life_date)
        self.end_service_date_matplotlib_format = matplotlib.dates.date2num(self._end_service_date)

    @staticmethod
    def add_months(source_date, months):
        """Return ``source_date`` shifted forward by ``months`` calendar months."""
        return source_date + relativedelta(months=+months)

    def get_start_date(self):
        """Datetime at which this version was released."""
        return self._start_date

    def get_end_of_life_date(self):
        """Datetime after which this version is no longer supported at all."""
        return self._end_of_life_date

    def get_end_service_date(self):
        """Datetime at which the service period ends and maintenance begins."""
        return self._end_service_date

    def compute_end_service_date(self):
        """Default service period: 12 months from the start date."""
        return self.add_months(self._start_date, 12)
class ChartVersions(object):
    """Loads ESP-IDF release data (from a URL or a local ``idf_versions.js``
    file) and renders a Gantt-style support-period chart with matplotlib."""

    def __init__(self, url=None, filename=None):
        self._releases = self._get_releases_from_url(url=url, filename=filename)
        # Supported (not yet end-of-life) releases, newest name first.
        # NOTE(review): this is a lexicographic sort on version_name, not a
        # semantic-version sort -- confirm that is acceptable for the labels.
        self.sorted_releases_supported = sorted(self.filter_old_versions(self._releases), key=lambda x: x.version_name,
                                                reverse=True)

    def get_releases_as_json(self):
        """Map each supported release name to its ISO-formatted milestone dates."""
        return {
            x.version_name: {
                'start_date': x.get_start_date().strftime('%Y-%m-%d'),
                'end_service': x.get_end_service_date().strftime('%Y-%m-%d'),
                'end_date': x.get_end_of_life_date().strftime('%Y-%m-%d')
            } for x in self.sorted_releases_supported
        }

    @staticmethod
    def parse_chart_releases_from_js(js_as_string):
        # The source is a JS file; extract the JSON object that follows the
        # 'RELEASES: ' marker, up to (but not including) the trailing '};'.
        return json.loads(js_as_string[js_as_string.find('RELEASES: ') + len('RELEASES: '):js_as_string.rfind('};')])

    def _get_all_version_from_url(self, url=None, filename=None):
        """Fetch the raw versions file (``url`` takes precedence) and parse it."""
        # NOTE(review): the file handle from open() is never closed explicitly;
        # consider a `with` block.
        releases_file = requests.get(url).text if url is not None else ''.join(open(filename).readlines())
        return self.parse_chart_releases_from_js(releases_file)

    def _get_releases_from_url(self, url=None, filename=None):
        """Build a Version object for every release found in the source file."""
        all_versions = self._get_all_version_from_url(url, filename)
        # NOTE(review): Version() runs parser.parse() on the end date, so an
        # entry lacking 'end_date' (passed through as None here) would raise.
        return [
            Version(version_name=x,
                    explicit_start_date=all_versions[x]['start_date'],
                    explicit_end_date=all_versions[x]['end_date'] if 'end_date' in all_versions[x].keys() else None,
                    explicit_end_service_date=all_versions[x]['end_service'] if 'end_service' in all_versions[
                        x].keys() else None)
            for x in all_versions.keys()
        ]

    @staticmethod
    def filter_old_versions(versions):
        """Keep only versions whose end-of-life date is still in the future."""
        return list(
            filter(lambda x: x.get_end_of_life_date() >= dt.datetime.now(x.get_end_of_life_date().tzinfo), versions))

    @staticmethod
    def months_timedelta(datetime_1, datetime2):
        """Whole-month distance between two datetimes (order-insensitive)."""
        datetime_1, datetime2 = (datetime2, datetime_1) if datetime_1 > datetime2 else (datetime_1, datetime2)
        return (datetime2.year * 12 + datetime2.month) - (datetime_1.year * 12 + datetime_1.month)

    @staticmethod
    def find_next_multiple_of_power_two(number, initial=3):
        """
        Computes the next multiple of the number by some power of two.
        >>> ChartVersions.find_next_multiple_of_power_two(7, 3)
        12
        """
        msb = number.bit_length()
        return 3 if number <= 1 else initial << msb - 2 << (1 & number >> msb - 2)

    def find_nearest_multiple_of_power_two(self, number, initial=3, prefer_next=False):
        """Pick whichever of the next/previous power-of-two multiples lies
        closer to ``number`` (ties go to the next one when prefer_next)."""
        next_num = self.find_next_multiple_of_power_two(number=number - 1, initial=initial)
        previous_num = next_num >> 1
        return next_num if abs(next_num - number) < (abs(previous_num - number) + int(prefer_next)) else previous_num

    def create_chart(self,
                     figure_size=(41.8330013267, 16.7332005307),
                     subplot=111,
                     step_size=0.5,
                     bar_height=0.3,
                     version_alpha=0.8,
                     lts_service_color='darkred',
                     lts_maintenance_color='red',
                     bar_align='center',
                     date_interval=None,
                     output_chart_name='docs/chart',
                     output_chart_extension='.png',
                     months_surrounding_chart=4,
                     service_period_label='Service period (Recommended for new designs)',
                     maintenance_period_text='Maintenance period'):
        """Render one horizontal bar per supported release (dark red for the
        service period, red for the maintenance tail) and save the figure to
        ``output_chart_name + output_chart_extension``."""
        fig = plt.figure(figsize=figure_size)
        ax = fig.add_subplot(subplot)
        labels_count = len(self.sorted_releases_supported)
        # One y-tick position per release, spaced step_size apart.
        pos = np.arange(step_size, labels_count * step_size + step_size, step_size)
        for release, i in zip(self.sorted_releases_supported, range(labels_count)):
            start_date = release.start_date_matplotlib_format
            end_of_service_date = release.end_service_date_matplotlib_format
            end_date = release.end_of_life_date_matplotlib_format
            # Service-period bar: from start to end-of-service.
            ax.barh((i * step_size) + step_size, (end_of_service_date or end_date) - start_date, left=start_date,
                    height=bar_height, align=bar_align,
                    color=lts_service_color,
                    alpha=version_alpha,
                    edgecolor=lts_service_color)
            # NOTE(review): end_of_service_date is a date2num float and is
            # never None here, so this check is always true; the `or` above
            # would only fall back for the value 0.0 -- confirm intent.
            if end_of_service_date is not None:
                # Maintenance-period bar: from end-of-service to end-of-life.
                ax.barh((i * step_size) + step_size, end_date - end_of_service_date, left=end_of_service_date,
                        height=bar_height, align=bar_align,
                        color=lts_maintenance_color, alpha=version_alpha, edgecolor=lts_maintenance_color)
        ax.set_ylim(bottom=0, ymax=labels_count * step_size + step_size)
        # X-axis range: pad the extreme release dates by a few months and snap
        # to the first of the month.
        max_ax_date = Version.add_months(
            max(self.sorted_releases_supported,
                key=lambda version: version.get_end_of_life_date().replace(tzinfo=None)).get_end_of_life_date(),
            months_surrounding_chart + 1).replace(day=1)
        min_ax_date = Version.add_months(
            min(self.sorted_releases_supported,
                key=lambda version: version.get_start_date().replace(tzinfo=None)).get_start_date(),
            -months_surrounding_chart).replace(day=1)
        # Tick interval in months: explicit override, or a power-of-two
        # multiple chosen so roughly ten ticks fit on the axis.
        x_ax_interval = date_interval or self.find_nearest_multiple_of_power_two(
            self.months_timedelta(max_ax_date, min_ax_date) // 10)
        ax.set_xlim(xmin=min_ax_date, xmax=max_ax_date)
        ax.grid(color='g', linestyle=':')
        ax.xaxis_date()
        # Monthly ticks every x_ax_interval months, labeled e.g. "Jan 2022".
        rule = rrulewrapper(MONTHLY, interval=x_ax_interval)
        loc = RRuleLocator(rule)
        formatter = DateFormatter('%b %Y')
        ax.xaxis.set_major_locator(loc)
        ax.xaxis.set_major_formatter(formatter)
        x_labels = ax.get_xticklabels()
        plt.ylabel('ESP-IDF Release', size=12)
        # Newest release at the top of the chart.
        ax.invert_yaxis()
        fig.autofmt_xdate()
        # Legend swatches for the two bar colors.
        darkred_patch = mpatches.Patch(color=lts_service_color, label=service_period_label)
        red_patch = mpatches.Patch(color=lts_maintenance_color, label=maintenance_period_text)
        plt.setp(plt.yticks(pos, map(lambda x: x.version_name, self.sorted_releases_supported))[1], rotation=0,
                 fontsize=10, family='Tahoma')
        plt.setp(x_labels, rotation=30, fontsize=11, family='Tahoma')
        plt.legend(handles=[darkred_patch, red_patch], prop={'size': 10, 'family': 'Tahoma'},
                   bbox_to_anchor=(1.01, 1.165), loc='upper right')
        fig.set_size_inches(11, 5, forward=True)
        plt.savefig(output_chart_name + output_chart_extension, bbox_inches='tight')
        print('Saved into ' + output_chart_name + output_chart_extension)
if __name__ == '__main__':
    # CLI entry point: read the release table (preferring --filename over
    # --url) and render the support-period chart.
    description_text = ('Create chart of version support. Set the url or filename with versions.'
                        'If you set both filename and url the script will prefer filename.')
    arg_parser = argparse.ArgumentParser(description=description_text)
    arg_parser.add_argument('--url', metavar='URL', default='https://dl.espressif.com/dl/esp-idf/idf_versions.js')
    arg_parser.add_argument('--filename',
                            help='Set the name of the source file, if is set, the script ignores the url.')
    arg_parser.add_argument('--output-format', help='Set the output format of the image.', default='svg')
    arg_parser.add_argument('--output-file', help='Set the name of the output file.', default='docs/chart')
    args = arg_parser.parse_args()
    # A given --filename wins: the url is dropped entirely in that case.
    source_url = None if args.filename is not None else args.url
    # Keep only the last three characters of the format (e.g. 'svg', 'png').
    extension = '.' + args.output_format.lower()[-3:]
    versions = ChartVersions(url=source_url, filename=args.filename)
    versions.create_chart(output_chart_name=args.output_file, output_chart_extension=extension)

View File

@ -0,0 +1,7 @@
.. This document contains all the inline substitutions in the ESP-AT repo.
.. |icon-green-check| image:: ../../_static/icon-green-check.png
.. |icon-orange-check| image:: ../../_static/icon-orange-check.png
.. |icon-red-cross| image:: ../../_static/icon-red-cross.png

1
docs/myapp/bin/python Symbolic link
View File

@ -0,0 +1 @@
python3

1
docs/myapp/bin/python3 Symbolic link
View File

@ -0,0 +1 @@
/usr/bin/python3

1
docs/myapp/lib64 Symbolic link
View File

@ -0,0 +1 @@
lib

3
docs/myapp/pyvenv.cfg Normal file
View File

@ -0,0 +1,3 @@
home = /usr/bin
include-system-site-packages = false
version = 3.7.4

9
docs/page_redirects.txt Executable file
View File

@ -0,0 +1,9 @@
# Redirects from "old URL" "new URL"
#
# Space delimited
#
# New URL should be relative to document root, only)
#
# Empty lines and lines starting with # are ignored
# get-started/ESP_AT_Commands_Set AT_Command_Set/AT_Command_Set

View File

@ -1,75 +0,0 @@
# Performance Test
## 1. AFE
### 1.1 Resource Occupancy(ESP32)
|algorithm Type|RAM|Average cpu loading(compute with 2 cores)| Frame Length|
|:---:|:---:|:---:|:---:|
|AEC(HIGH_PERF)|114 KB|11%|32 ms|
|NS|27 KB|5%|10 ms|
|AFE Layer|73 KB| | |
### 1.2 Resource Occupancy(ESP32S3)
|algorithm Type|RAM|Average cpu loading(compute with 2 cores)| Frame Length|
|:---:|:---:|:---:|:---:|
|AEC(LOW_COST)|152.3 KB|8%|32 ms|
|AEC(HIGH_PERF)|166 KB|11%|32 ms|
|BSS(LOW_COST)|198.7 KB|6%|64 ms|
|BSS(HIGH_PERF)|215.5 KB|7%|64 ms|
|NS|27 KB|5%|10 ms|
|MISO|56 KB|8%|16 ms|
|AFE Layer|227 KB| | |
## 2. WakeNet
### 2.1 Resource Occupancy(ESP32)
|Model Type|Parameter Num|RAM|Average Running Time per Frame| Frame Length|
|:---:|:---:|:---:|:---:|:---:|
|Quantised WakeNet5|41 K|15 KB|5.5 ms|30 ms|
|Quantised WakeNet5X2|165 K|20 KB|10.5 ms|30 ms|
|Quantised WakeNet5X3|371 K|24 KB|18 ms|30 ms|
### 2.2 Resource Occupancy(ESP32S3)
|Model Type|RAM|PSRAM|Average Running Time per Frame| Frame Length|
|:---:|:---:|:---:|:---:|:---:|
|Quantised WakeNet8 @ 2 channel|50 KB|1640 KB|10.0 ms|32 ms|
|Quantised WakeNet9 @ 2 channel|16 KB|324 KB|3.0 ms|32 ms|
|Quantised WakeNet9 @ 3 channel|20 KB|347 KB|4.3 ms|32 ms|
### 2.3 Performance
|Distance| Quiet | Stationary Noise (SNR = 4 dB)| Speech Noise (SNR = 4 dB)| AEC Interruption (-10 dB)|
|:---:|:---:|:---:|:---:|:---:|
|1 m|98%|96%|94%|96%|
|3 m|98%|96%|94%|94%|
False triggering rate: 1 time in 12 hours
**Note**: We use the ESP32-S3-Korvo V4.0 development board and the WakeNet9(Alexa) model in our test.
## 3. MultiNet
### 2.1 Resource Occupancy(ESP32)
|Model Type|Internal RAM|PSRAM|Average Running Time per Frame| Frame Length|
|:---:|:---:|:---:|:---:|:---:|
|MultiNet 2|13.3 KB|9KB|38 ms|30 ms|
### 2.2 Resource Occupancy(ESP32S3)
|Model Type|Internal RAM|PSRAM|Average Running Time per Frame| Frame Length|
|:---:|:---:|:---:|:---:|:---:|
|MultiNet 4|16.8KB|1866 KB|18 ms|32 ms|
|MultiNet 4 Q8|10.5 KB|1009 KB|11 ms|32 ms|
|MultiNet 5 Q8|16 KB |2310 KB|12 ms|32 ms|
### 2.3 Performance with AFE
|Model Type|Distance| Quiet | Stationary Noise (SNR = 4 dB)| Speech Noise (SNR = 4 dB)|
|:---:|:---:|:---:|:---:|:---:|
|MultiNet 4|3 m|98%|93%|92%|
|MultiNet 4 Q8|3 m|94%|92%|91%|

1
docs/requirements.txt Executable file
View File

@ -0,0 +1 @@
esp-docs==1.3.0

View File

@ -1,201 +0,0 @@
# MultiNet Introduction
MultiNet is a lightweight model designed to realize speech commands recognition offline on ESP32 series. Now, up to 200 speech commands, including customized commands, are supported.
> Support Chinese and English speech commands recognition (esp32s3 is required for English speech commands recognition)
> Support user-defined commands
> Support adding / deleting / modifying commands during operation
> Up to 200 commands are supported
> It supports single recognition and continuous recognition
> Lightweight and low resource consumption
> Low delay, within 500ms
> Support online Chinese and English model switching (esp32s3 only)
> The model is partitioned separately to support users to apply OTA
## 1. Overview
The MultiNet input is the audio processed by the audio-front-end algorithm (AFE), with the format of 16KHz, 16bit and mono. By recognizing the audio, you can correspond to the corresponding Chinese characters or English words.
The following table shows the model support of Espressif SoCs:
![multinet_model](../img/MultiNet_model.png)
Note: the model ending with Q8 represents the 8bit version of the model, means more lightweight.
## 2. Commands Recognition Process
Please see the flow diagram below:
![speech_command-recognition-system](../img/multinet_workflow.png)
## 3. User Guide
### 3.1 Requirements of speech commands
- The recommended length of Chinese is generally 4-6 Chinese characters. Too short leads to high false recognition rate and too long is inconvenient for users to remember
- The recommended length of English is generally 4-6 words
- Mixed Chinese and English is not supported in command words
- Currently, up to 200 command words are supported
- The command word cannot contain Arabic numerals and special characters
- Avoid common command words like "hello"
- The greater the pronunciation difference of each Chinese character / word in the command word, the better the performance
### 3.2 Speech commands customization method
> Support a variety of speech commands customization methods
> Support dynamic addition / deletion / modification of speech commands
#### 3.2.1 Format of Speech commands
Speech commands string need to meet specific formats, as follows:
- Chinese
Chinese speech commands need to use Chinese Pinyin, and there should be a space between the Pinyin spelling of each word. For example, "打开空调" should be written as "da kai kong tiao", "打开绿色灯" should be written as "da kai lv se deng".
In addition, we also provide corresponding tools for users to convert Chinese characters into pinyin. See details:
- English
English speech commands need to be represented by specific phonetic symbols. The phonetic symbols of each word are separated by spaces, such as "turn on the light", which needs to be written as "TkN nN jc LiT".
**We provide specific conversion rules and tools. For details, please refer to the English G2P [tool](../../tool/multinet_g2p.py).**
#### 3.2.2 Set speech commands offline
Multinet supports flexible speech commands setting methods. No matter which way users set speech commands (code / network / file), they only need to call the corresponding API.
Here we provide two methods of adding speech commands:
- Use `menuconfig`
Users can refer to the example in ESP-Skainet, users can define their own speech commands by `idf.py menuconfig -> ESP Speech Recognition-> Add Chinese speech commands/Add English speech commands`.
![menuconfig_add_speech_commands](../img/menuconfig_add_speech_commands.png)
Please note that a single `Command ID` can support multiple phrases. For example, "da kai kong tiao" and "kai kong tiao" have the same meaning, you can write them in the entry corresponding to the same command ID, and separate the adjacent entries with the English character "," without spaces before and after ",".
Then call the following API:
```
/**
* @brief Update the speech commands of MultiNet by menuconfig
*
* @param multinet The multinet handle
*
* @param model_data The model object to query
*
* @param langugae The language of MultiNet
*
* @return
* - ESP_OK Success
* - ESP_ERR_INVALID_STATE Fail
*/
esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
```
- Add speech commands in the code
Users can refer to example in ESP-Skainet for this method of adding speech commands.
In this method, users directly set the speech command words in the code and transmits them to multinet. In the actual development and products, the user can transmit the required speech commands through various possible ways such as network / UART / SPI and change the speech commands.
#### 3.2.3 Set speech commands online
MultiNet supports online dynamic addition / deletion / modification of speech commands during operation, without changing models or adjusting parameters. For details, please refer to the example in ESP-Skainet.
Please refer to [esp_mn_speech_commands](../../src/esp_mn_speech_commands.c) for details of APIs:
## 4. Run speech commands recognition
Speech commands recognition needs to be run together with the audio front-end (AFE) in esp-sr (WakeNet needs to be enabled in AFE). For the use of AFE, please refer to the document:
[AFE 介绍及使用](../audio_front_end/README_CN.md)
### 4.1 MultiNet Initialization
- Initialize multinet model
- Set speech commands
Please refer #3.
### 4.2 Run MultiNet
When users use AFE and enable WakeNet, they can use MultiNet. There are the following requirements:
> The frame length of MultiNet is equal to the AFE fetch frame length
> The audio format supported is 16KHz, 16bit, mono. The data obtained by AFE fetch is also in this format
- Get the frame length that needs to be passed into MultiNet
```
int mu_chunksize = multinet->get_samp_chunksize(model_data);
```
- MultiNet detect
We send the data from AFE fetch to the following API:
```
esp_mn_state_t mn_state = multinet->detect(model_data, buff);
```
The length of `buff` is `mu_chunksize * sizeof(int16_t)`.
### 4.3 The detect result of MultiNet
Speech commands recognition supports two basic modes:
> Single recognition
> Continuous recognition
Speech command recognition must be used with WakeNet. After wake-up, MultiNet detection can be run.
When the MultiNet is running, it will return the recognition status of the current frame in real time `mn_state`, which is currently divided into the following identification states:
- ESP_MN_STATE_DETECTING
This status indicates that the MultiNet is detecting but target speech command word has not been recognized.
- ESP_MN_STATE_DETECTED
This status indicates that the target speech command has been recognized. At this time, the user can call `get_results` interface obtains the identification results.
```
esp_mn_results_t *mn_result = multinet->get_results(model_data);
```
The information identifying the result is stored in the return value of the `get_result` API, the data type of the return value is as follows:
```
typedef struct{
esp_mn_state_t state;
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
} esp_mn_results_t;
```
- `state` is the recognition status of the current frame
- `num` means the number of recognized commands, `num` <= 5, up to 5 possible results are returned
- `phrase_id` means the Phrase ID of speech commands
- `prob` means the recognition probability of the recognized entries, which is arranged from large to small
Users can use `phrase_id[0]` and `prob[0]` get the recognition result with the highest probability.
- ESP_MN_STATE_TIMEOUT
This status means that the speech commands has not been detected for a long time and will exit automatically. Wait for the next wake-up.
Therefore:
Exit the speech recognition when the return status is `ESP_MN_STATE_DETECTED`, it is single recognition mode;
Exit the speech recognition when the return status is `ESP_MN_STATE_TIMEOUT`, it is continuous recognition mode;
## 5. Other configurations
### 5.1 Threshold setting
This function is still under development.

View File

@ -1,211 +0,0 @@
# MultiNet 介绍 [[English]](./README.md)
MultiNet 是为了在 ESP32 系列上离线实现多命令词识别而设计的轻量化模型,目前支持 200 个以内的自定义命令词识别。
> 支持中文和英文命令词识别(英文命令词识别需使用 ESP32S3
> 支持用户自定义命令词
> 支持运行过程中 增加/删除/修改 命令词语
> 最多支持 200 个命令词
> 支持单次识别和连续识别两种模式
> 轻量化,低资源消耗
> 低延时延时500ms内
> 支持在线中英文模型切换(仅 ESP32S3
> 模型单独分区,支持用户应用 OTA
## 1. 概述
MultiNet 输入为经过前端语音算法AFE处理过的音频格式为 16KHz16bit单声道。通过对音频进行识别则可以对应到相应的汉字或单词。
以下表格展示在不同芯片上的模型支持:
![multinet_model](../img/MultiNet_model.png)
用户选择不同的模型的方法请参考 [flash model](../flash_model/README_CN.md) 。
**注:其中以 `Q8` 结尾的模型代表模型的 8bit 版本,表明该模型更加轻量化。**
## 2. 命令词识别原理
可以参考以下命令词识别原理:
![speech_command-recognition-system](../img/multinet_workflow.png)
## 3. 使用指南
### 3.1 命令词设计要求
- 中文推荐长度一般为 4-6 个汉字,过短导致误识别率高,过长不方便用户记忆
- 英文推荐长度一般为 4-6 个单词
- 命令词中不支持中英文混合
- 目前最多支持 **200** 条命令词
- 命令词中不能含有阿拉伯数字和特殊字符
- 命令词避免使用常用语
- 命令词中每个汉字/单词的发音相差越大越好
### 3.2 命令词自定义方法
> 支持多种命令词自定义方法
> 支持随时动态增加/删除/修改命令词
MultiNet 对命令词自定义方法没有限制,用户可以通过任意方式(在线/离线)等将所需的命令词按照相应的格式,组成链表发给 MultiNet 即可。
我们针对不同客户提供不同的 example 来展示一些命令词的自定义方法,大体分为以下两种。
#### 3.2.1 命令词格式
命令词需要满足特定的格式,具体如下:
- 中文
中文命令词需要使用汉语拼音,并且每个字的拼音拼写间要间隔一个空格。比如“打开空调”,应该写成 "da kai kong tiao"比如“打开绿色灯”需要写成“da kai lv se deng”。
**并且我们也提供相应的工具,供用户将汉字转换为拼音,详细可见:**
- 英文
英文命令词需要使用特定音标表示每个单词的音标间用空格隔开比如“turn on the light”需要写成“TkN nN jc LiT”。
**我们提供了具体转换规则和工具,详细可以参考[英文转音素工具](../../tool/multinet_g2p.py) 。**
#### 3.2.2 离线设置命令词
MultiNet 支持多种且灵活的命令词设置方式,用户无论通过那种方式编写命令词(代码/网络/文件),只需调用相应的 API 即可。
在这里我们提供两种常见的命令词添加方法。
- 编写 `menuconfig` 进行添加
可以参考 ESP-Skainet 中 example 通过 `idf.py menuconfig -> ESP Speech Recognition-> Add Chinese speech commands/Add English speech commands` 添加命令词。
![menuconfig_add_speech_commands](../img/menuconfig_add_speech_commands.png)
请注意单个 Command ID 可以支持多个短语,比如“打开空调”和“开空调”表示的意义相同,则可以将其写在同一个 Command ID 对应的词条中,用英文字符“,”隔开相邻词条(“,”前后无需空格)。
然后通过在代码里调用以下 API 即可:
```
/**
* @brief Update the speech commands of MultiNet by menuconfig
*
* @param multinet The multinet handle
*
* @param model_data The model object to query
*
* @param langugae The language of MultiNet
*
* @return
* - ESP_OK Success
* - ESP_ERR_INVALID_STATE Fail
*/
esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
```
- 通过自己创建命令词进行添加
可以参考 ESP-Skainet 中 example 了解这种添加命令词的方法。
该方法中,用户直接在代码中编写命令词,并传给 MultiNet在实际开发和产品中用户可以通过网络/UART/SPI等多种可能的方式传递所需的命令词并随时更换命令词。
#### 3.2.3 在线设置命令词
MultiNet 支持在运行过程中在线动态添加/删除/修改命令词,该过程无须更换模型和调整参数。具体可以参考 ESP-Skainet 中 example。
具体API说明请参考 [esp_mn_speech_commands](../../src/esp_mn_speech_commands.c)
## 4. 运行命令词识别
命令词识别需要和 ESP-SR 中的声学算法模块AFEAFE中需使能唤醒WakeNet一起运行。关于 AFE 的使用,请参考文档:
[AFE 介绍及使用](../audio_front_end/README_CN.md)
当用户配置完成 AFE 后,请按照以下步骤配置和运行 MultiNet
### 4.1 MultiNet 初始化
- 模型加载与初始化   
请参考[flash_model](../flash_model/README_CN.md)
- 设置命令词
请参考上文 #3
### 4.2 MultiNet 运行
当用户开启 AFE 且使能 WakeNet 后,则可以运行 MultiNet。且有以下几点要求
> 传入帧长和 AFE fetch 帧长长度相等
> 支持音频格式为 16KHz16bit单通道。AFE fetch 拿到的数据也为这个格式
- 确定需要传入 MultiNet 的帧长
```
int mu_chunksize = multinet->get_samp_chunksize(model_data);
```
`mu_chunksize` 是需要传入 MultiNet 的每帧音频的 `short` 型点数,这个大小和 AFE 中 fetch 的每帧数据点数完全一致。
- MultiNet detect
我们将 AFE 实时 `fetch` 到的数据送入以下 API
```
esp_mn_state_t mn_state = multinet->detect(model_data, buff);
```
`buff` 的长度为 `mu_chunksize * sizeof(int16_t)`
### 4.3 MultiNet 识别结果
命令词识别支持两种基本模式:
> 单次识别
> 连续识别
命令词识别必须和唤醒搭配使用,当唤醒后可以运行命令词的检测。
命令词模型在运行时,会实时返回当前帧的识别状态 `mn_state`,目前分为以下几种识别状态:
- ESP_MN_STATE_DETECTING
该状态表示目前正在识别中,还未识别到目标命令词。
- ESP_MN_STATE_DETECTED
该状态表示目前识别到了目标命令词,此时用户可以调用 `get_results` 接口获取识别结果。
```
esp_mn_results_t *mn_result = multinet->get_results(model_data);
```
识别结果的信息存储在 `get_result` API 的返回值中,返回值的数据类型如下:
```
typedef struct{
esp_mn_state_t state;
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
} esp_mn_results_t;
```
- 其中 `state` 为当前识别的状态
- `num`表示识别到的词条数目,`num` <= 5即最多返回 5 个候选结果
- `phrase_id` 表示识别到的词条对应的 Phrase ID
- `prob` 表示识别到的词条识别概率,从大到小依次排列
用户可以使用 `phrase_id[0]``prob[0]` 拿到概率最高的识别结果。
- ESP_MN_STATE_TIMEOUT
该状态表示长时间未检测到命令词,自动退出。等待下次唤醒。
因此:
当命令词识别返回状态为 `ESP_MN_STATE_DETECTED` 时退出命令词识别,则为单次识别模式;
当命令词识别返回状态为 `ESP_MN_STATE_TIMEOUT` 时退出命令词识别,则为连续识别模式;
## 5. 其他配置和使用
### 5.1 阈值设置
  该功能仍在开发中.

88
docs/sphinx-known-warnings.txt Executable file
View File

@ -0,0 +1,88 @@
# File contains known/allowed Sphinx warnings.
#
# Build will fail if sphinx-warning-log.txt contains any lines
# which are not in this file. Lines are pre-sanitized by
# check_doc_warnings.sh to remove formatting, paths and line numbers.
#
# Warnings in this file must be in the same overall order as the log file.
#
#
# Sphinx known issue https://github.com/sphinx-doc/sphinx/issues/2683
#
# Note: warnings below will be gone after upgrade
# to the following package==version
#
# sphinx==1.8.4
# breathe==4.11.1
#
esp_a2dp_api.inc:line: WARNING: Invalid definition: Expected identifier in nested name. [error at 21]
union esp_a2d_mcc_t::@1 esp_a2d_mcc_t::cie
---------------------^
esp_bt_defs.inc:line: WARNING: Invalid definition: Expected identifier in nested name. [error at 21]
union esp_bt_uuid_t::@0 esp_bt_uuid_t::uuid
---------------------^
#
# Breathe known issue: https://github.com/michaeljones/breathe/issues/405
# Sphinx known issue: https://github.com/sphinx-doc/sphinx/pull/5901
#
# Note: warnings below have been identified after upgrade
# to the following package==version
#
# sphinx==1.8.4
# breathe==4.11.1
#
ulp.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_load_binary(uint32_t load_addr, const uint8_t * program_binary, size_t program_size)
ulp.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_run(uint32_t entry_point)
ulp.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_set_wakeup_period(size_t period_index, uint32_t period_us)
ulp-legacy.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_load_binary(uint32_t load_addr, const uint8_t * program_binary, size_t program_size)
ulp-legacy.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_run(uint32_t entry_point)
ulp-legacy.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_set_wakeup_period(size_t period_index, uint32_t period_us)
README.rst:line: WARNING: Duplicate declaration, esp_err_t ulp_run(uint32_t entry_point)
#
# Issue present only when building on msys2 / mingw32 START >>>
#
esp_spp_api.inc:line: WARNING: Error in type declaration.
If typedef-like declaration:
Type must be either just a name or a typedef-like declaration.
If just a name:
Error in declarator or parameters and qualifiers
Invalid definition: Expected identifier in nested name, got keyword: void [error at 4]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
----^
If typedef-like declaration:
Error in declarator
If pointer to member declarator:
Invalid definition: Expected identifier in nested name. [error at 4]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
----^
If declId, parameters, and qualifiers:
Invalid definition: Expected identifier in nested name. [error at 4]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
----^
If parenthesis in noptr-declarator:
Error in declarator or parameters and qualifiers
If pointer to member declarator:
Invalid definition: Expected identifier in nested name. [error at 5]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
-----^
If declarator-id:
Invalid definition: Expected identifier in nested name. [error at 5]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
-----^
If type alias or template alias:
Invalid definition: Expected identifier in nested name, got keyword: void [error at 4]
void() esp_spp_cb_t(esp_spp_cb_event_t event, esp_spp_cb_param_t *param)
----^
#
# Issue present only when building on msys2 / mingw32 END <<<
#
spi_master.inc:line: WARNING: Duplicate declaration, struct spi_transaction_t spi_transaction_t
spi_slave.inc:line: WARNING: Duplicate declaration, struct spi_slave_transaction_t spi_slave_transaction_t
esp_flash.inc:line: WARNING: Duplicate declaration, struct esp_flash_t esp_flash_t
spi_flash_types.inc:line: WARNING: Duplicate declaration, struct spi_flash_host_driver_t spi_flash_host_driver_t
wear-levelling.rst:line: WARNING: Duplicate declaration, bool esp_vfs_fat_mount_config_t::format_if_mount_failed
wear-levelling.rst:line: WARNING: Duplicate declaration, int esp_vfs_fat_mount_config_t::max_files
wear-levelling.rst:line: WARNING: Duplicate declaration, size_t esp_vfs_fat_mount_config_t::allocation_unit_size
wear-levelling.rst:line: WARNING: Duplicate declaration, esp_vfs_fat_mount_config_t

18
docs/utils.sh Executable file
View File

@ -0,0 +1,18 @@
# Bash helper functions for adding SSH keys
# Install a base64-encoded SSH private key as ~/.ssh/id_rsa.
# $1 - the private key, base64-encoded.
function add_ssh_keys() {
    local encoded_key="${1}"
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh
    # Write the encoded key verbatim (no trailing newline), then decode it.
    printf '%s' "${encoded_key}" >~/.ssh/id_rsa_base64
    base64 --decode --ignore-garbage ~/.ssh/id_rsa_base64 >~/.ssh/id_rsa
    chmod 600 ~/.ssh/id_rsa
}
# Install SSH keys and disable strict host-key checking for the docs server.
# $1 - base64-encoded SSH private key material.
# $2 - documentation server hostname.
# $3 - user name used to log in to that server.
function add_doc_server_ssh_keys() {
    local encoded_key="${1}"
    local doc_server="${2}"
    local doc_user="${3}"
    add_ssh_keys "${encoded_key}"
    # Append a per-host stanza so CI can connect non-interactively.
    printf 'Host %s\n\tStrictHostKeyChecking no\n\tUser %s\n\n' "${doc_server}" "${doc_user}" >>~/.ssh/config
}

View File

@ -1,64 +0,0 @@
#Espressif Speech Wake-up Solution Customization Process
---
#### 1.1 Speech Wake Word Customization Process
Espressif provides users with the offline wake word customization service, which allows users to use both publicly available wake words (such as "Hi Lexin", ”Alexa”, and “Espressif”) and customized wake words.
1. If you want to use publicly available wake words for commercial use
- Please check the wake words provided in [esp-sr](https://github.com/espressif/esp-sr);
- We will continue to provide more and more wake words that are free for commercial use.
2. If you want to use custom wake words, we can also provide the offline wake word customization service.
- If you provide a training corpus
- It must consist of at least 20,000 qualified corpus entries (see the section below for detailed requirements);
- It will take two to three weeks for Espressif to train and optimize the corpus after the hardware design meets our requirement;
- It will be delivered in a static library of wake word;
- Espressif will charge training fees based on the scale of your production.
- Otherwise
- Espressif will collect and provide all the training corpus;
- Espressif will deliver a static library file of successfully trained wake word to you, but won't share the corpus;
- It will take around three weeks to collect and train the corpus;
- Espressif will charge training fees (corpus collecting fees included) based on the scale of your production.
- The above time is subject to change depending on the project.
- Espressif will only charge a one-time customization fee depending on the number of wake words you customize and the scale of your production, and will not charge license fees for the quantity and time of use. Please email us at [sales@espressif.com](mailto:sales@espressif.com) for details of the fee.
3. If you want to use offline command words
- Please set them by yourself referring to [esp-sr](https://github.com/espressif/esp-sr/tree/c5896943ea278195968c93c8b3466c720e641ebc/speech_command_recognition) algorithm. They do not need additional customization.
- Similar to speech wake words, the effect of command words is also related to hardware designs, so please refer to *Espressif MIC Design Guidelines*.
--------
#### 2.1 Requirements on Corpus
As mentioned above, you can provide your own training corpus for Espressif. Below are the requirements.
1. Audio file format
- Sample rate: 16 kHz
- Encoding: 16-bit signed int
- Channel: mono
- Format: WAV
2. Sampling environment
- Room with an ambient noise lower than 30 dB and reverberation less than 0.3 s, or a professional audio room (recommended).
- Recording device: high-fidelity microphone.
- The whole product is strongly recommended.
- The development board of your product also works when there is no cavity structure.
- Record in 16 kHz, and don't use **resampling**.
- At the recording site, pay attention to the impact of reverberation interference in a closed environment.
- Collect samples with multiple recording devices at the same time (recommended).
- For example, position the devices at 1 m and 3 m away.
- So that more samples are collected in the same amount of time and with the same participants.
3. Sample distribution
- Sample size: 500. Males and females should be close to 1:1.
- The number of children under 12 years old involved varies from product to product, but the percentage should be no less than 15%.
- If there are requirements for certain languages or dialects, special corpus samples need to be provided.
- It is recommended to name the samples according to the age, gender, and quantity of the collected samples, such as HiLeXin\_male\_B\_014.wav, and ABCD represents different age groups.
#### 2.2 Hardware Design Guidelines
1. Please refer to *Espressif MIC Design Guidelines*.

View File

@ -1,55 +0,0 @@
# wakeNet
wakeNet, which is a wake word engine built upon neural network, is specially designed for low-power embedded MCUs. Now, the wakeNet model supports up to 5 wake words.
## Overview
Please see the flow diagram of wakeNet below:
<center>
<img src="../img/wakenet_workflow.png" width = "800" />
</center>
- Speech Feature:
The wakeNet uses [MFCC](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum) to obtain the features of the input audio clip (16 KHz, 16 bit, single track). The window width and step width of each frame of the audio clip are both 30 ms.
- Neural Network:
Now, the neural network structure has been updated to the sixth edition, among which,
- wakeNet1, wakeNet2, wakeNet3, wakeNet4, wakeNet6, and wakeNet7 have been deprecated.
- wakeNet5 only supports the ESP32 chip.
- wakeNet8 and wakeNet9 only support the ESP32S3 chip, and are built upon the [Dilated Convolution](https://arxiv.org/pdf/1609.03499.pdf) structure.
Note that the network structures of wakeNet5, wakeNet5X2 and wakeNet5X3 are the same, but wakeNet5X2 and wakeNet5X3 have more parameters than wakeNet5. Please refer to [Resource Occupancy](#performance-test) for details.
- Keyword Triggering Method
For continuous audio stream, we calculate the average recognition results (M) for several frames and generate a smoothing prediction result, to improve the accuracy of keyword triggering. Only when the M value is larger than the set threshold, a triggering command is sent.
The following table shows the models supported by Espressif SoCs:
| SoCs | wakeNet5 | wakeNet8 | wakeNet9 |
| :-------------------------------------------------------- | :----------: | :---------: | :---------: |
|ESP32|Yes|No|No|
|ESP32S3|No|Yes|Yes|
## Use wakeNet
- How to select the wakeNet model
Please refer to [Flash model 介绍](../flash_model/README.md).
- How to run wakeNet
wakeNet is currently included in the [AFE](../audio_front_end/README.md), which is running by default, and returns the detect results through the AFE fetch interface.
If users want to disable wakeNet, please use:
```
afe_config.wakenet_init = False.
```
## Performance Test
Please refer to [Performance_test](../performance_test/README.md).
## Wake Word Customization
For details on how to customize your wake words, please see [Espressif Speech Wake Word Customization Process](ESP_Wake_Words_Customization.md).

View File

@ -1,58 +0,0 @@
# WakeNet [[English]](./README.md)
WakeNet是一个基于神经网络为低功耗嵌入式MCU设计的唤醒词模型目前支持5个以内的唤醒词识别。
## Overview
WakeNet的流程图如下
<center>
<img src="../img/wakenet_workflow.png" width = "800" />
</center>
- speech features
我们使用[MFCC](https://en.wikipedia.org/wiki/Mel-frequency_cepstrum)方法提取语音频谱特征。输入的音频文件采样率为16KHz单声道编码方式为signed 16-bit。每帧窗宽和步长均为30ms。
- neural network
神经网络结构已经更新到第6版其中
- wakeNet1,wakeNet2,wakeNet3,wakeNet4已经停止使用。
- wakeNet5应用于ESP32芯片。
- wakeNet8和wakeNet9应用于ESP32S3芯片模型基于 [Dilated Convolution](https://arxiv.org/pdf/1609.03499.pdf) 结构。
注意WakeNet5,WakeNet5X2 和 WakeNet5X3 的网络结构一致,但是 WakeNet5X2 和 WakeNet5X3 的参数比 WakeNet5 要多。请参考 [性能测试](#性能测试) 来获取更多细节。
- keyword trigger method
对连续的音频流为准确判断关键词的触发我们通过计算若干帧内识别结果的平均值M来判断触发。当M大于大于指定阈值发出触发的命令。
以下表格展示在不同芯片上的模型支持:
![wakent_model](../img/WakeNet_model.png)
## WakeNet使用
- WakeNet 模型选择
WakeNet 模型选择请参考 [Flash model 介绍](../flash_model/README_CN.md) 。
对于自定义的唤醒词,请参考[乐鑫语音唤醒词定制流程](乐鑫语音唤醒词定制流程.md)。
- WakeNet 运行
WakeNet 目前包含在语音前端算法 [AFE](../audio_front_end/README_CN.md) 中,默认为运行状态,并将识别结果通过 AFE fetch 接口返回。
如果用户需要关掉 WakeNet请在 AFE 配置时选择:
```
afe_config.wakenet_init = False.
```
即可停止运行 WakeNet。
## 性能测试
具体请参考 [Performance_test](../performance_test/README.md)。
## 唤醒词定制
如果需要定制唤醒词,请参考[乐鑫语音唤醒词定制流程](乐鑫语音唤醒词定制流程.md)。

View File

@ -1,78 +0,0 @@
# 乐鑫语音唤醒方案客户定制流程 [[English]](./ESP_Wake_Words_Customization.md)
#### 一、离线唤醒词定制服务
乐鑫提供 离线语音唤醒词 定制服务,详情如下:
1. “嗨乐鑫“”你好小鑫”“你好小智”和“嗨Jeson” 等官方公开的唤醒词,客户可直接商用
- 如 ADFASR Demo 提供的离线命令词提供
- 乐鑫会逐渐开放更多的商用 Free 关键词
2. 除官方开放的唤醒词,可接受客户定制服务,分如下两种情况
- 如果客户提供 唤醒词语料
- 需要提供大于 1.5 万条合格的语料(语料需求见下文)
- 语料提供给乐鑫后,需要 23 周进行模型训练及调优
- 根据量级收取少量模型定制费用
- 如果客户不提供 唤醒词语料
- 所有训练语料由乐鑫采集提供(训练前、后,乐鑫不会泄露客户语料)
- 语料提供给乐鑫后,需要 23 周进行模型训练及调优
- 根据量级收取少量模型定制费用(语料采集费用另收)
- 费用收取具体定价和定制时间,烦请邮件至 sales@espressif.com 协议商定
- 收费取决于 唤醒词定制的数量 以及 产品量产数量
3. 对于乐鑫唤醒词模型:
- 目前单个模型最多支持5个及以内的唤醒词识别
- 每个唤醒词通常由 3-6 音节组成比如“hi乐鑫”“Alexa”“小爱同学”“你好天猫”等
- 可多个唤醒模型一起使用,具体需根据客户应用的资源消耗确定
#### 二、训练语料要求
客户可自备训练语料或向第三方采购,对于语料有以下要求
- 语料音频格式要求
- 采样率(sample rate)16 KHz
- 编码encoding)16-bit signed int
- 通道数channelmono
- 格式wav
- 语料采集要求
- 采样人数:最好样本可以大于 500 人,其中男女,年龄分布均衡,儿童不小于 100 人
- 采样环境:环境噪声低(< 40 dB建议在语音室等专业环境下录制
- 录制场景:距离麦克风 1 m 处每人录制 15 遍,其中 5 遍快语速5 遍正常语速5 遍慢语速;距离麦克风 3 m 处每人录制 15 遍,其中 5 遍快语速5 遍正常语速5 遍慢语速
- 录制设备:高保真麦克风
- 样本命名需体现样本信息:如 female_age_fast_id.wav 或有单独表格记录每个样本的年龄,性别等信息
#### 三、硬件设计与测试
语音唤醒效果与硬件设计以及腔体结构有很大关系,为确保硬件设备设计合理,请认真阅读以下内容
- 硬件设计要求
- 对于各类语音音箱类设计,乐鑫可提供 原理图PCB 等设计参考客户可以根据自身具体需求设计修改设计完毕后乐鑫可提供Review服务避免常见设计问题。
- 腔体结构,最好有专门的声学人员参与设计,乐鑫不提供 ID 设计类的参考,客户可以市场上主流音箱设计为参考
> 例如 天猫精灵、小度音箱、谷歌音箱等
2. 硬件设计好后,客户可通过以下简单测试,验证硬件设计效果(下列测试都是基于语音室环境,客户可以根据自身测试环境做调整)
- 录音测试,验证 MIC、codec 录音增益以及失真情况
- 音源 90 dB距离 0.1 m 播放样本,调节增益,保证录音样本不饱和
- 使用扫频文件0~20 KHz使用 16 KHz 采样率录音,音频不会出现明显频率混叠
- 录制 100 个语音样本,使用公开的云端语音识别端口识别,识别率达到指定标准
- 播音测试,验证 功率放大器(PA、喇叭的失真情况
- 测试PA功率 @1% 总谐波失真THD
- 语音算法测试,验证 AEC、BFM、NS 效果
- 首先需要注意下参考信号延时,不同的 AEC 算法有不同的要求
- 以实际产品场景为测试指标,例如 MIC 播放 85DB-90DB 大梦想家.wav, 设备回采
- 保存回声参考信号、回声消除后的信号分析,对比查看 AEC、NS、BFM 等效果
- DSP性能测试验证DSP参数是否合适同时尽可能减少DSP算法中的非线性失真
- 降噪(Noise suppression)算法性能测试
- 回声消除Acoustic Echo Cancellation算法性能测试
- 语音增强Speech Enhancement算法性能测试
3. 硬件设计完毕后,**可寄送** 1-2 台硬件至乐鑫,乐鑫会基于客户整机做唤醒词性能调优

18
docs/zh_CN/404.rst Normal file
View File

@ -0,0 +1,18 @@
:orphan:
无法找到该网页
==============
:link_to_translation:`en:[English]`
.. note::
抱歉,无法找到您想访问的页面。
请使用左侧菜单,浏览其他文档内容。您也可以使用菜单上方的搜索框,搜索您想查看的内容。
.. figure:: ../_static/404-page__cn.svg
:align: center
:alt: 抱歉,无法找到您想访问的页面。
:figclass: align-center
* :ref:`genindex`

View File

@ -0,0 +1,4 @@
{% extends '!layout.html' %}
{% block comments %}
<p style="text-align:center"><a href="https://www.espressif.com/zh-hans/company/documents/documentation_feedback?docId=4846&sections={{ title|striptags|e }} ({{ pagename }})&version={{ release }} ({{ version }})">提供有关此文档的反馈</a></p>
{% endblock %}

View File

@ -0,0 +1,69 @@
乐鑫麦克风设计指南
==================
:link_to_translation:`en:[English]`
基于乐鑫的 S3 系列语音开发板,对于整机 Mic 设计要求如下:
麦克风电器性能推荐
------------------
#. 麦克类型:全向型 MEMS 麦克风
#. 灵敏度
- 1Pa 声压下模拟麦灵敏度不低于 -38dBV数字麦灵敏度要求不低于 -26dB。
- 公差控制在 ±2dB对于麦克阵列推荐采用 ±1dB 公差。
#. 信噪比(SNR)
- 信噪比不低于 62dB推荐 >64dB
- 频率响应在 50~16KHz 范围内的波动在 ±3dB 之内。
- 麦克风MEMS MIC的 PSRR 应大于 55 dB
结构设计建议
----------------
#. 麦克孔孔径或宽度推荐大于 1mm拾音管道尽量短腔体尽可能小保证麦克和结构组件配合的谐振频率在 9KHz 以上。
#. 拾音孔深度和直径比小于 2:1壳体厚度推荐1mm如果壳体过厚需增大开孔面积。
#. 麦克孔上需通过防尘网进行保护。
#. 麦克风与设备外壳之间必须加硅胶套或泡棉等进行密封和防震,需进行过盈配合设计,以保证麦克的密封性。
#. 麦克孔不能被遮挡,底部拾音的麦克孔需结构上增加凸起,避免麦克孔被桌面等遮挡。
#. 麦克需远离喇叭等会产生噪音或振动的物体摆放,且与喇叭音腔之间通过橡胶垫等隔离缓冲。
麦克阵列设计推荐
-----------------------
#. 麦克类型:全向型硅麦,同一个阵列内的麦克推荐同一厂家同一型号,不建议混用。
#. 麦克阵列中各麦克灵敏度差异在 3dB 之内。
#. 相位差:多麦克阵列中麦克之间的相位差控制在 10° 以内。
#. 麦克阵列中各麦克的结构设计,推荐采用相同的设计,以保证结构设计的一致性。
#. 2 MIC方案麦克间距要求 4~6.5cm,连接两个麦克风的轴线应平行于水平线,且两个麦克的中心尽量靠近产品水平方向的中心。
#. 3 MIC方案3 个麦克风等间距并且成正圆分布(夹⻆互成 120度间距要求 4~6.5cm。
麦克风结构密封性
----------------
用橡皮泥等材料封堵麦克拾音孔密封前后麦克风采集信号的幅度衰减25dB合格推荐30dB。测试方法
#. 麦克风正上方 0.5 米处,播放白噪声,麦克风处音量 90dB。
#. 使用麦克风阵列录制 10s 以上存储为录音文件A。
#. 用橡皮泥等材料封堵麦克拾音孔,使用麦克风阵列录制 10s 以上存储为录音文件B。
#. 对比两个文件的频谱需保证100~8KHz频段内整体衰减 25dB 以上。
回声参考信号设计
----------------
#. 回声参考信号推荐尽量靠近喇叭侧,推荐从 DA 后级 PA 前级回采。
#. 扬声器音量最大时,输入到麦克的回声参考信号不能有饱和失真,最大音量下喇叭功放输出 THD 满足100Hz 小于 10%200Hz 小于 6%350Hz 以上频率,小于 3% 。
#. 扬声器音量最大时,麦克处拾音的声压不超过 102dB@1KHz。
#. 回声参考信号电压不超过 ADC 的最大允许输入电压,电压过高需增加衰减电路。
#. 从D类功放输出引参考回声信号需增加低通滤波器滤波器的截止频率推荐 >22KHz。
#. 音量最大播放时,回采信号峰值 -3 到 -5dB。
麦克风阵列一致性验证
----------------------------
要求各个麦克风采样信号幅度相差小于 3dB测试方法
#. 麦克风正上方 0.5 米处,播放白噪声,麦克风处音量 90dB 。
#. 使用麦克风阵列录制 10s 以上,查看各 mic 录音幅度和音频采样率是否一致。

View File

@ -0,0 +1,411 @@
Audio Front-end 框架
====================
:link_to_translation:`en:[English]`
乐鑫 Audio Front-end(AFE) 算法框架由乐鑫 AI 实验室自主开发。该框架基于 ESP32 系列芯片,能够提供高质量并且稳定的音频数据。
概述
----
乐鑫 AFE 框架以最便捷的方式基于乐鑫的 ESP32 系列芯片进行语音前端处理。使用乐鑫 AFE 框架,您可以获取高质量且稳定的音频数据,从而更加方便地构建唤醒或语音识别等应用。
乐鑫 AFE 的功能分为两套:
#. 针对语音识别场景;
#. 针对语音通话场景。如下所示:
- 语音识别场景
.. figure:: ../../_static/AFE_SR_overview.png
:alt: overview
- 语音通话场景
.. figure:: ../../_static/AFE_VOIP_overview.png
:alt: overview
乐鑫 AFE 的数据流也相应分为两种场景,如下所示:
- 语音识别场景
.. figure:: ../../_static/AFE_SR_workflow.png
:alt: overview
工作流程如下:
#. 使用 **ESP_AFE_SR_HANDLE**进行AFE 的创建和初始化(``voice_communication_init`` 需配置为 false )
#. AFE feed输入音频数据feed 内部会先进行 AEC 算法处理
#. 内部: 进行 BSS/NS 算法处理
#. AFE fetch返回处理过的单通道音频数据和相关信息 fetch 内部会进行 VAD 处理,以及唤醒词的检测,具体行为取决于用户对 ``afe_config_t`` 结构体的配置。(注: ``wakenet_init````voice_communication_init`` 不可同时配置为 true)
- 语音通话场景
.. figure:: ../../_static/AFE_VOIP_workflow.png
:alt: overview
工作流程如下:
#. 使用 **ESP_AFE_VC_HANDLE** 进行AFE 的创建和初始化 (``voice_communication_init`` 需配置为 true )
#. AFE feed输入音频数据feed 内部会先进行 AEC 算法处理
#. 内部: 首先进行 BSS/NS 算法处理若为双麦随后还会进行MISO 算法处理;
#. AFE fetch返回处理过的单通道音频数据和相关信息。其中会进行AGC非线性放大具体增益值取决于用户对 ``afe_config_t`` 结构体的配置若为双麦在AGC之前还会进行降噪处理。(注: ``wakenet_init````voice_communication_init`` 不可同时配置为 true)
.. note::
``afe->feed()````afe->fetch()`` 对用户可见, ``Internal BSS/NS/MISO Task`` 对用户不可见。
* AEC 在 afe->feed() 函数中运行;若 aec_init 配置为 false 状态BSS/NS 将会在 afe->feed() 函数中运行。
* BSS/NS/MISO 为 AFE 内部独立 Task 进行处理;
* VAD/WakeNet 的结果,以及处理后的单通道音频,通过 afe->fetch() 函数获取。
选择 AFE Handle
~~~~~~~~~~~~~~~
目前 AFE 支持单麦和双麦两种应用场景,并且可对算法模块进行灵活配置。单麦场景内部 Task 为 NS 处理,双麦场景内部 Task 为 BSS 处理,双麦场景若配置为语音通话(即: ``wakenet_init=false, voice_communication_init=true``),则会再增加一个 MISO 的内部 Task。
对于AFE handle的获取语音识别场景与语音通话场景略有差异
- 语音识别
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- 语音通话
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
输入音频
~~~~~~~~
目前 AFE 支持单麦和双麦两种应用场景,可根据 ``afe->feed()`` 的音频,配置相应的音频通道数。修改方式:在宏 ``AFE_CONFIG_DEFAULT()`` 中对 ``pcm_config`` 结构体成员进行配置修改,其支持如下几种配置组合 (注:一定要满足 ``total_ch_num = mic_num + ref_num``)
::
total_ch_num=1, mic_num=1, ref_num=0
total_ch_num=2, mic_num=1, ref_num=1
total_ch_num=2, mic_num=2, ref_num=0
total_ch_num=3, mic_num=2, ref_num=1
(注解: total_ch_num: 总通道数mic_num: 麦克风通道数ref_num: 参考回路通道数)
对于 AEC目前只支持单回路故 ref_num 的值只能为 0 或 1
- AFE 单麦场景
- 输入音频格式为 16KHz, 16bit, 双通道 (1个通道为 mic 数据,另一个通道为参考回路) ; 若不需要 AEC,音频不包含参考回路则可只包含1个通道 mic 数据ref_num 设置为0。
- 输入数据帧长,会根据用户配置的算法模块不同而有差异, 用户可以使用 ``afe->get_feed_chunksize`` 来获取需要的采样点数目(采样点数据类型为 int16
数据排布如下:
.. figure:: ../../_static/AFE_mode_0.png
:alt: input data of single MIC
:height: 0.7in
- AFE 双麦场景
- 输入音频格式为 16KHz, 16bit, 三通道;若不需要 AEC,音频不包含参考回路,则可只包含两个通道 mic 数据ref_num 设置为0。
- 输入数据帧长,会根据用户配置的算法模块不同而有差异,用户可以使用 ``afe->get_feed_chunksize`` 来获取需要填充的数据量
数据排布如下:
.. figure:: ../../_static/AFE_mode_other.png
:alt: input data of dual MIC
:height: 0.75in
注意:换算成数据量大小为: ``afe->get_feed_chunksize * 通道数 * sizeof(short)``
AEC 简介
~~~~~~~~
AEC (Acoustic Echo Cancellation) 算法最多支持双麦处理,能够有效的去除 mic 输入信号中的自身播放声音。从而可以在自身播放音乐的情况下进行很好的语音识别等应用。
NS 简介
~~~~~~~
NS (Noise Suppression)
算法支持单通道处理,能够对单通道音频中的非人声噪声进行抑制,尤其针对稳态噪声,具有很好的抑制效果。
BSS 简介
~~~~~~~~
BSS (Blind Source Separation)
算法支持双通道处理,能够很好的将目标声源和其余干扰音进行盲源分离,从而提取出有用音频信号,保证了后级语音的质量。
MISO 简介
~~~~~~~~~
MISO (Multi Input Single Output)
算法支持双通道输入,单通道输出。用于在双麦场景,没有唤醒使能的情况下,选择信噪比高的一路音频输出。
VAD 简介
~~~~~~~~
VAD (Voice Activity Detection) 算法支持实时输出当前帧的语音活动状态。
AGC 简介
~~~~~~~~
AGC (Automatic Gain Control)
动态调整输出音频的幅值,当弱信号输入时,放大输出幅度;当输入信号达到一定强度时,压缩输出幅度。
WakeNet or Bypass 简介
~~~~~~~~~~~~~~~~~~~~~~
用户可以选择是否在 AFE 中进行唤醒词的识别。当用户调用 ``afe->disable_wakenet(afe_data)`` 后,则进入 Bypass 模式AFE 模块不会进行唤醒词的识别。
输出音频
~~~~~~~~
AFE 的输出音频为单通道数据。在语音识别场景若WakeNet 开启的情况下AFE 会输出有目标人声的单通道数据。在语音通话场景,将会输出信噪比更高的单通道数据。
.. only:: html
快速开始
--------
定义 afe_handle
~~~~~~~~~~~~~~~~~~
``afe_handle`` 是用户后续调用 afe 接口的函数句柄。所以第一步需先获得 ``afe_handle``
- 语音识别
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_SR_HANDLE;
- 语音通话
::
esp_afe_sr_iface_t *afe_handle = &ESP_AFE_VC_HANDLE;
配置 afe
~~~~~~~~~~~
获取 afe 的配置:
::
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
可调整 ``afe_config`` 中各算法模块的使能及其相应参数:
::
#define AFE_CONFIG_DEFAULT() { \
.aec_init = true, \
.se_init = true, \
.vad_init = true, \
.wakenet_init = true, \
.voice_communication_init = false, \
.voice_communication_agc_init = false, \
.voice_communication_agc_gain = 15, \
.vad_mode = VAD_MODE_3, \
.wakenet_model_name = NULL, \
.wakenet_mode = DET_MODE_2CH_90, \
.afe_mode = SR_MODE_LOW_COST, \
.afe_perferred_core = 0, \
.afe_perferred_priority = 5, \
.afe_ringbuf_size = 50, \
.memory_alloc_mode = AFE_MEMORY_ALLOC_MORE_PSRAM, \
.agc_mode = AFE_MN_PEAK_AGC_MODE_2, \
.pcm_config.total_ch_num = 3, \
.pcm_config.mic_num = 2, \
.pcm_config.ref_num = 1, \
}
- aec_init: AEC 算法是否使能。
- se_init: BSS/NS 算法是否使能。
- vad_init: VAD 是否使能 ( 仅可在语音识别场景中使用 )
- wakenet_init: 唤醒是否使能。
- voice_communication_init: 语音通话是否使能。与 wakenet_init
不能同时使能。
- voice_communication_agc_init: 语音通话中AGC是否使能。
- voice_communication_agc_gain: AGC的增益值单位为dB。
- vad_mode: VAD 检测的操作模式,越大越激进。
- wakenet_model_name: 宏 ``AFE_CONFIG_DEFAULT()`` 中该值默认为NULL。使用 ``idf.py menuconfig`` 选择了相应的唤醒模型后,在调用 ``afe_handle->create_from_config`` 之前,需给该处赋值具体的模型名字,类型为字符串形式。唤醒模型的具体说明,详见: `flash_model <../flash_model/README_CN.md>`__ (注意:示例代码中,使用了 esp_srmodel_filter() 获取模型名字,若 menuconfig 中选择了多个模型共存,该函数将会随机返回一个模型名字)
- wakenet_mode: 唤醒的模式。对应为多少通道的唤醒根据mic通道的数量选择
- afe_mode: 乐鑫 AFE 目前支持 2 种工作模式分别为SR_MODE_LOW_COST,SR_MODE_HIGH_PERF。详细可见 afe_sr_mode_t 枚举。
- SR_MODE_LOW_COST: 量化版本,占用资源较少。
- SR_MODE_HIGH_PERF: 非量化版本,占用资源较多。
**ESP32 芯片,只支持模式 SR_MODE_HIGH_PERF; ESP32S3 芯片,两种模式均支持**
- afe_perferred_core: AFE 内部 BSS/NS/MISO 算法,运行在哪个 CPU 核。
- afe_perferred_priority: AFE 内部 BSS/NS/MISO 算法运行的task优先级。
- afe_ringbuf_size: 内部 ringbuf 大小的配置。
- memory_alloc_mode: 内存分配的模式。可配置三个值:
- AFE_MEMORY_ALLOC_MORE_INTERNAL: 更多的从内部ram分配。
- AFE_MEMORY_ALLOC_INTERNAL_PSRAM_BALANCE: 部分从内部ram分配。
- AFE_MEMORY_ALLOC_MORE_PSRAM: 绝大部分从外部psram分配
- agc_mode: 将音频线性放大的 level 配置,该配置在语音识别场景下起作用,并且在唤醒使能时才生效。可配置四个值:
- AFE_MN_PEAK_AGC_MODE_1: 线性放大喂给后续multinet的音频峰值处为 -5dB。
- AFE_MN_PEAK_AGC_MODE_2: 线性放大喂给后续multinet的音频峰值处为 -4dB。
- AFE_MN_PEAK_AGC_MODE_3: 线性放大喂给后续multinet的音频峰值处为 -3dB。
- AFE_MN_PEAK_NO_AGC: 不做线性放大
- pcm_config: 根据 ``afe->feed()`` 喂入的音频结构进行配置,该结构体有三个成员变量需要配置:
- total_ch_num: 音频总的通道数total_ch_num = mic_num + ref_num。
- mic_num: 音频的麦克风通道数。目前仅支持配置为 1 或 2。
- ref_num: 音频的参考回路通道数,目前仅支持配置为 0 或 1。
创建 afe_data
~~~~~~~~~~~~~~~~
用户使用 ``afe_handle->create_from_config(&afe_config)`` 函数来获得数据句柄这将会在afe内部使用传入的参数即为上面第2步中获得的配置。
::
/**
* @brief Function to initialze a AFE_SR instance
*
* @param afe_config The config of AFE_SR
* @returns Handle to the AFE_SR data
*/
typedef esp_afe_sr_data_t* (*esp_afe_sr_iface_op_create_from_config_t)(afe_config_t *afe_config);
feed 音频数据
~~~~~~~~~~~~~~~~
在初始化 AFE 完成后,用户需要将音频数据使用 ``afe_handle->feed()`` 函数输入到 AFE 中进行处理。
输入的音频大小和排布格式可以参考 **输入音频** 这一步骤。
::
/**
* @brief Feed samples of an audio stream to the AFE_SR
*
* @Warning The input data should be arranged in the format of channel interleaving.
* The last channel is reference signal if it has reference data.
*
* @param afe The AFE_SR object to query
*
* @param in The input microphone signal, only support signed 16-bit @ 16 KHZ. The frame size can be queried by the
* `get_feed_chunksize`.
* @return The size of input
*/
typedef int (*esp_afe_sr_iface_op_feed_t)(esp_afe_sr_data_t *afe, const int16_t* in);
获取音频通道数:
使用 ``afe_handle->get_total_channel_num()`` 函数可以获取需要传入 ``afe_handle->feed()`` 函数的总数据通道数。其返回值等于AFE_CONFIG_DEFAULT()中配置的 ``pcm_config.mic_num + pcm_config.ref_num``
::
/**
* @brief Get the total channel number which be config
*
* @param afe The AFE_SR object to query
* @return The amount of total channels
*/
typedef int (*esp_afe_sr_iface_op_get_total_channel_num_t)(esp_afe_sr_data_t *afe);
fetch 音频数据
~~~~~~~~~~~~~~
用户调用 ``afe_handle->fetch()`` 函数可以获取处理完成的单通道音频以及相关处理信息。
fetch 的数据采样点数目(采样点数据类型为 int16可以通过 ``afe_handle->get_fetch_chunksize`` 获取。
::
/**
* @brief Get the amount of each channel samples per frame that need to be passed to the function
*
* Every speech enhancement AFE_SR processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param afe The AFE_SR object to query
* @return The amount of samples to feed the fetch function
*/
typedef int (*esp_afe_sr_iface_op_get_samp_chunksize_t)(esp_afe_sr_data_t *afe);
``afe_handle->fetch()`` 的函数声明如下:
::
/**
* @brief fetch enhanced samples of an audio stream from the AFE_SR
*
* @Warning The output is single channel data, no matter how many channels the input is.
*
* @param afe The AFE_SR object to query
* @return The result of output, please refer to the definition of `afe_fetch_result_t`. (The frame size of output audio can be queried by the `get_fetch_chunksize`.)
*/
typedef afe_fetch_result_t* (*esp_afe_sr_iface_op_fetch_t)(esp_afe_sr_data_t *afe);
其返回值为结构体指针,结构体定义如下:
::
/**
* @brief The result of fetch function
*/
typedef struct afe_fetch_result_t
{
int16_t *data; // the data of audio.
int data_size; // the size of data. The unit is byte.
int wakeup_state; // the value is wakenet_state_t
int wake_word_index; // if the wake word is detected. It will store the wake word index which start from 1.
int vad_state; // the value is afe_vad_state_t
int trigger_channel_id; // the channel index of output
int wake_word_length; // the length of wake word. It's unit is the number of samples.
int ret_value; // the return state of fetch function
void* reserved; // reserved for future use
} afe_fetch_result_t;
WakeNet 使用
~~~~~~~~~~~~~
当用户在唤醒后需要进行其他操作,比如离线或在线语音识别,这时候可以暂停 WakeNet 的运行,从而减轻 CPU 的资源消耗。
用户可以调用 ``afe_handle->disable_wakenet(afe_data)`` 来停止 WakeNet。当后续应用结束后又可以调用 ``afe_handle->enable_wakenet(afe_data)`` 来开启 WakeNet。
另外ESP32S3 芯片,支持唤醒词切换。(注: ESP32 芯片只支持一个唤醒词,不支持切换)。在初始化 AFE 完成后ESP32S3 芯片可通过 ``set_wakenet()`` 函数切换唤醒词。例如, ``afe_handle->set_wakenet(afe_data, “wn9_hilexin”)`` 切换到“Hi Lexin”唤醒词。具体如何配置多个唤醒词详见 `flash_model <../flash_model/README_CN.md>`__
AEC 使用
~~~~~~~~
AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或开启 AEC。
- 停止 AEC
afe->disable_aec(afe_data);
- 开启 AEC
afe->enable_aec(afe_data);

24
docs/zh_CN/conf.py Executable file
View File

@ -0,0 +1,24 @@
# -*- coding: utf-8 -*-
#
# Simplified Chinese (zh_CN) RTD & Sphinx config file
#
# Uses ../conf_common.py for most non-language-specific settings.
# Importing conf_common adds all the non-language-specific
# parts to this conf module
import sys
import os
# Make the parent docs directory importable so conf_common can be found.
sys.path.insert(0, os.path.abspath('..'))
from conf_common import *  # noqa: F401, F403 - need to make available everything from common
# General information about the project.
project = u'ESP-SR 用户指南'
copyright = u'2016 - 2022 乐鑫信息科技(上海)股份有限公司'
pdf_title = u'ESP-SR 用户指南'
# Final PDF filename will contain target and version
pdf_file_prefix = u'esp-sr'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
language = 'zh_CN'

View File

@ -0,0 +1,215 @@
模型加载方式
============
:link_to_translation:`en:[English]`
在 esp-sr 中WakeNet 和 MultiNet 均会使用到大量的模型数据,模型数据位于 ``ESP-SR_PATH/model/`` 中。 目前 esp-sr 支持以下模型加载方式:
ESP32
- 从 Flash 中直接加载
ESP32S3
- 从 Flash spiffs 分区加载
- 从外部 SDCard 加载
从而在 ESP32S3 上可以:
- 大大减小用户应用 APP BIN 的大小
- 支持选择最多两个唤醒词
- 支持中文和英文命令词识别在线切换
- 方便用户进行 OTA
- 支持从 SD 卡读取和更换模型,更加便捷且可以缩减项目使用的模组 Flash 大小
- 当用户进行开发时,当修改不涉及模型时,可以避免每次烧录模型数据,大大缩减烧录时间,提高开发效率
模型配置介绍
------------
运行 ``idf.py menuconfig`` 进入 ``ESP Speech Recognition``:
.. figure:: ../../_static/model-1.png
:alt: overview
overview
Model Data Path
~~~~~~~~~~~~~~~
该选项只在 ESP32S3 上可用,表示模型数据的存储位置,支持选择 ``spiffs partition````SD Card``
- ``spiffs partition`` 表示模型数据存储在 Flash spiffs 分区中,模型数据将会从 Flash spiffs 分区中加载
- ``SD Card`` 表示模型数据存储在 SD 卡中,模型数据将会从 SD Card 中加载
Use AFE
~~~~~~~
该选项需要打开,用户无须修改,请保持默认配置。
Use Wakenet
~~~~~~~~~~~~
* 此选项默认打开,当用户只使用 AEC 或者 BSS 等,无须运行 WakeNet 或 MultiNet 时请关闭此选项将会减小工程固件的大小。根据menuconfig列表选择唤醒词模型 ``ESP Speech Recognition -> Select wake words``. 括号中为唤醒词模型的名字你需要在代码用名字切换初始化wakenet.
|select wake wake|
* 如果想加载多个唤醒词以便在代码中进行唤醒词的切换首选选择Load Multiple Wake Words
|multi wake wake|
* 然后按照列表选择多个唤醒词:
|image1|
**注:多唤醒词选项只支持 ESP32S3具体根据客户硬件flash容量选择合适数量的唤醒词。**
更多细节请参考 `WakeNet <../wake_word_engine/README.rst>`__
Use Multinet
~~~~~~~~~~~~~
此选项默认打开。当用户只使用 WakeNet 或者其他算法模块时,请关闭此选项,将会在一些情况下减小工程固件的大小。
ESP32 芯片只支持中文命令词识别。ESP32S3 支持中文和英文命令词识别,且支持中英文识别模型切换。
- Chinese Speech Commands Model
中文命令词识别模型选择。
ESP32 支持:
- None
- chinese single recognition (MultiNet2)
ESP32S3 支持:
- None
- chinese single recognition (MultiNet4.5)
- chinese single recognition (MultiNet4.5 quantized with 8-bit)
- English Speech Commands Model
英文命令词识别模型选择。
该选项不支持 ESP32。
ESP32S3 支持:
- None
- english recognition (MultiNet5 quantized with 8-bit, depends on WakeNet8)
- Add Chinese speech commands
当用户在 ``Chinese Speech Commands Model`` 中选择非 ``None`` 时,需要在该项处添加中文命令词。
- Add English speech commands
当用户在 ``English Speech Commands Model`` 中选择非 ``None`` 时,需要在该项处添加英文命令词。
用户按照需求自定义添加命令词,具体请参考 `MultiNet <../speech_command_recognition/README.md>`__
模型使用
---------
当用户完成以上的配置选择后,应用层请参考 esp-skainet 进行初始化和使用。这里介绍一下模型数据加载在用户工程中的代码实现。 也可以参考代码 `model_path.c <../../src/model_path.c>`__
使用 ESP32
~~~~~~~~~~
当用户使用 ESP32 时,由于只支持从 Flash 中直接加载模型数据,因此代码中模型数据会自动按照地址从 Flash 中读取所需数据。 为了和ESP32S3进行兼容代码中模型的初始化方法是和ESP32S3相同的可参考下面ESP32S3的模型加载API
使用 ESP32S3
~~~~~~~~~~~~~
模型数据存储在 SPIFFS
^^^^^^^^^^^^^^^^^^^^^
- 编写分区表:
::
model, data, spiffs, , SIZE,
其中 SIZE 可以参考在用户使用 idf.py build 编译时的推荐大小,例如:
::
Recommended model partition size: 500K
- 初始化 spiffs 分区 **调用提供的 API** :用户可以直接调用
``esp_srmodel_init()`` API 来初始化 spiffs并返回spiffs中的模型。
- base_path模型的存储 ``base_path````srmodel`` ,不可更改
- partition_label模型的分区 label 为 ``model`` ,需要和 上述分区表中的 ``Name`` 保持一致
完成上述配置后,模型会在工程编译完成后自动生成 ``model.bin`` ,并在用户调用 ``idf.py flash`` 时烧写到 spiffs 分区。
模型存储在 SD Card
^^^^^^^^^^^^^^^^^^
当用户配置 #1.2 模型数据存储位置是 ``SD Card`` 时,用户需要:
- 手动移动模型数据
将模型移动到 SDCard 中,用户完成以上配置后,可以先进行编译,编译完成后将 ``ESP-SR_PATH/model/target/`` 目录下的文件拷贝至 SD 卡的根目录。
- 自定义路径 如果用户想将模型放置于指定文件夹,可以自己修改 ``get_model_base_path()`` 函数,位于 ``ESP-SR_PATH/model/model_path.c``。 比如,指定文件夹为 SD 卡目录中的 ``espmodel``, 则可以修改该函数为:
.. only:: html
::
char *get_model_base_path(void)
{
#if defined CONFIG_MODEL_IN_SDCARD
return "/sdcard/espmodel";
#elif defined CONFIG_MODEL_IN_SPIFFS
return "srmodel";
#else
return NULL;
#endif
}
- 初始化 SD 卡
用户需要初始化 SD 卡,来使系统能够记载 SD 卡,如果用户使用 esp-skainet可以直接调用 ``esp_sdcard_init("/sdcard", num);`` 来初始化其支持开发板的 SD 卡。否则,需要自己编写。
完成以上操作后,便可以进行工程的烧录。
.. only:: html
代码中模型初始化与使用
^^^^^^^^^^^^^^^^^^^^^^
::
//
// step1: initialize spiffs and return models in spiffs
//
srmodel_list_t *models = esp_srmodel_init();
//
// step2: select the specific model by keywords
//
char *wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, NULL); // select wakenet model
char *mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); // select multinet model
char *alexa_wn_name = esp_srmodel_filter(models, ESP_WN_PREFIX, "alexa"); // select wakenet with "alexa" wake word.
char *en_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_ENGLISH); // select english multinet model
char *cn_mn_name = esp_srmodel_filter(models, ESP_MN_PREFIX, ESP_MN_CHINESE); // select chinese multinet model
// It also works if you use the model name directly in your code.
char *my_wn_name = "wn9_hilexin";
// we recommend you to check that it is loaded correctly
if (!esp_srmodel_exists(models, my_wn_name))
    printf("%s can not be loaded correctly\n", my_wn_name);
//
// step3: initialize model
//
esp_wn_iface_t *wakenet = esp_wn_handle_from_name(wn_name);
model_iface_data_t *wn_model_data = wakenet->create(wn_name, DET_MODE_2CH_90);
esp_mn_iface_t *multinet = esp_mn_handle_from_name(mn_name);
model_iface_data_t *mn_model_data = multinet->create(mn_name, 6000);
.. |select wake wake| image:: ../../_static/wn_menu1.png
.. |multi wake wake| image:: ../../_static/wn_menu2.png
.. |image1| image:: ../../_static/wn_menu3.png

22
docs/zh_CN/index.rst Normal file
View File

@ -0,0 +1,22 @@
ESP-SR 用户指南
=================
:link_to_translation:`en:[English]`
这里是乐鑫 `ESP-SR <https://github.com/espressif/esp-sr>`__ 的用户文档。本文档将介绍乐鑫以 ESP32 和 ESP32S3 系列芯片为基础推出的 AI 语音解决方案。从前端音频处理,到语音命令词识别,从硬件设计建议,到性能测试方法,全面介绍乐鑫在 AI 语音方面的系统性工作,为用户在乐鑫 ESP32 系列芯片及开发板上构建 AIoT 应用,提供有力参考。
乐鑫 AFE 算法已通过亚马逊 Alexa 内置设备的 Software Audio Front-End 认证。可在语音通话和语音识别等场景下提供高质量音频输入。 AFE 算法中内置的唤醒模块可实现本地语音唤醒功能,且支持唤醒词定制。乐鑫语音命令词识别模型可支持最多 200 条中英文命令词,且可在运行中修改命令词,为应用带来极大灵活性。
基于多年硬件设计与开发经验,乐鑫可为客户提供语音开发板 Review 服务,并乐意为客户自制开发板进行测试和调优,以展现算法最优性能。客户也可按照乐鑫提供的测试方式和自测结果,对开发板和整机产品进行深入评估。
.. toctree::
:hidden:
AFE 声学前端算法 <audio_front_end/README>
唤醒词模型 <wake_word_engine/README>
定制化唤醒词 <wake_word_engine/ESP_Wake_Words_Customization>
语音指令 <speech_command_recognition/README>
模型加载方式 <flash_model/README>
麦克风设计指南 <audio_front_end/Espressif_Microphone_Design_Guidelines>
测试报告 <test_report/README>
性能测试 <performance_test/README>

View File

@ -0,0 +1,164 @@
性能测试
========
:link_to_translation:`en:[English]`
AFE
---
资源占用(ESP32)
~~~~~~~~~~~~~~~
+-----------------+-----------------+-----------------+-----------------+
| algorithm Type | RAM | Average cpu | Frame Length |
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(HIGH_PERF) | 114 KB | 11% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 73 KB | | |
+-----------------+-----------------+-----------------+-----------------+
资源占用(ESP32S3)
~~~~~~~~~~~~~~~~~
+-----------------+-----------------+-----------------+-----------------+
| algorithm Type | RAM | Average cpu | Frame Length |
| | | loading(compute | |
| | | with 2 cores) | |
+=================+=================+=================+=================+
| AEC(LOW_COST) | 152.3 KB | 8% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| AEC(HIGH_PERF) | 166 KB | 11% | 32 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(LOW_COST) | 198.7 KB | 6% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| BSS(HIGH_PERF) | 215.5 KB | 7% | 64 ms |
+-----------------+-----------------+-----------------+-----------------+
| NS | 27 KB | 5% | 10 ms |
+-----------------+-----------------+-----------------+-----------------+
| MISO | 56 KB | 8% | 16 ms |
+-----------------+-----------------+-----------------+-----------------+
| AFE Layer | 227 KB | | |
+-----------------+-----------------+-----------------+-----------------+
WakeNet
-------
.. _resource-occupancyesp32-1:
资源占用(ESP32)
~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Parameter | RAM | Average | Frame |
| | Num | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| Quantised | 41 K | 15 KB | 5.5 ms | 30 ms |
| WakeNet5 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| Quantised | 165 K | 20 KB | 10.5 ms | 30 ms |
| WakeNet5X2 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| Quantised | 371 K | 24 KB | 18 ms | 30 ms |
| WakeNet5X3 | | | | |
+-------------+-------------+-------------+-------------+-------------+
.. _resource-occupancyesp32s3-1:
资源占用(ESP32S3)
~~~~~~~~~~~~~~~~~
+----------------+-------+---------+----------------+--------------+
| Model Type | RAM | PSRAM | Average | Frame Length |
| | | | Running Time | |
| | | | per Frame | |
+================+=======+=========+================+==============+
| Quantised | 50 KB | 1640 KB | 10.0 ms | 32 ms |
| WakeNet8 @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 16 KB | 324 KB | 3.0 ms | 32 ms |
| WakeNet9 @ 2 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
| Quantised | 20 KB | 347 KB | 4.3 ms | 32 ms |
| WakeNet9 @ 3 | | | | |
| channel | | | | |
+----------------+-------+---------+----------------+--------------+
性能测试
~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Distance | Quiet | Stationary | Speech | AEC |
| | | Noise (SNR | Noise (SNR | I |
| | | = 4 dB) | = 4 dB) | nterruption |
| | | | | (-10 dB) |
+=============+=============+=============+=============+=============+
| 1 m | 98% | 96% | 94% | 96% |
+-------------+-------------+-------------+-------------+-------------+
| 3 m | 98% | 96% | 94% | 94% |
+-------------+-------------+-------------+-------------+-------------+
误触发率12小时1次
**Note**: 我们在测试中使用了 ESP32-S3-Korvo V4.0 开发板和 WakeNet9(Alexa) 模型。
MultiNet
--------
.. _resource-occupancyesp32-2:
资源占用(ESP32)
~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Internal | PSRAM | Average | Frame |
| | RAM | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| MultiNet 2 | 13.3 KB | 9KB | 38 ms | 30 ms |
+-------------+-------------+-------------+-------------+-------------+
.. _resource-occupancyesp32s3-2:
资源占用(ESP32S3)
~~~~~~~~~~~~~~~~~
+-------------+-------------+-------------+-------------+-------------+
| Model Type | Internal | PSRAM | Average | Frame |
| | RAM | | Running | Length |
| | | | Time per | |
| | | | Frame | |
+=============+=============+=============+=============+=============+
| MultiNet 4 | 16.8KB | 1866 KB | 18 ms | 32 ms |
+-------------+-------------+-------------+-------------+-------------+
| MultiNet 4 | 10.5 KB | 1009 KB | 11 ms | 32 ms |
| Q8 | | | | |
+-------------+-------------+-------------+-------------+-------------+
| MultiNet 5 | 16 KB | 2310 KB | 12 ms | 32 ms |
| Q8 | | | | |
+-------------+-------------+-------------+-------------+-------------+
AFE 的性能
~~~~~~~~~~
+-----------+-----------+-----------+-----------+-----------+
| Model | Distance | Quiet | S | Speech |
| Type | | | tationary | Noise |
| | | | Noise | (SNR = 4 |
| | | | (SNR = 4 | dB) |
| | | | dB) | |
+===========+===========+===========+===========+===========+
| MultiNet | 3 m | 98% | 93% | 92% |
| 4 | | | | |
+-----------+-----------+-----------+-----------+-----------+
| MultiNet | 3 m | 94% | 92% | 91% |
| 4 Q8 | | | | |
+-----------+-----------+-----------+-----------+-----------+

View File

@ -0,0 +1,243 @@
MultiNet 介绍
=============
:link_to_translation:`en:[English]`
MultiNet 是为了在 ESP32 系列上离线实现多命令词识别而设计的轻量化模型,目前支持 200 个以内的自定义命令词识别。
* 支持中文和英文命令词识别(英文命令词识别需使用 ESP32S3
* 支持用户自定义命令词
* 支持运行过程中 增加/删除/修改 命令词语
* 最多支持 200 个命令词
* 支持单次识别和连续识别两种模式
* 轻量化,低资源消耗
* 低延时延时500ms内
* 支持在线中英文模型切换(仅 ESP32S3
* 模型单独分区,支持用户应用 OTA
概述
-------
MultiNet 输入为经过前端语音算法AFE处理过的音频格式为 16KHz16bit单声道。通过对音频进行识别则可以对应到相应的汉字或单词。
以下表格展示在不同芯片上的模型支持:
+---------+-----------+-------------+---------------+-------------+
| Chip | ESP32 | ESP32S3 |
+=========+===========+=============+===============+=============+
| Model | MultiNet2 | MultiNet4.5 | MultiNet4.5Q8 | MultiNet5Q8 |
+---------+-----------+-------------+---------------+-------------+
| Chinese | √ | √ | √ | √ |
+---------+-----------+-------------+---------------+-------------+
| English | | | | √ |
+---------+-----------+-------------+---------------+-------------+
用户选择不同的模型的方法请参考 `flash model <../flash_model/README_CN.md>`__
**注:其中以 ``Q8`` 结尾的模型代表模型的 8bit 版本,表明该模型更加轻量化。**
命令词识别原理
-----------------
可以参考以下命令词识别原理:
.. figure:: ../../_static/multinet_workflow.png
:alt: speech_command-recognition-system
speech_command-recognition-system
使用指南
--------
命令词设计要求
~~~~~~~~~~~~~~~
- 中文推荐长度一般为 4-6 个汉字,过短导致误识别率高,过长不方便用户记忆
- 英文推荐长度一般为 4-6 个单词
- 命令词中不支持中英文混合
- 目前最多支持 **200** 条命令词
- 命令词中不能含有阿拉伯数字和特殊字符
- 命令词避免使用常用语
- 命令词中每个汉字/单词的发音相差越大越好
命令词自定义方法
~~~~~~~~~~~~~~~~
* 支持多种命令词自定义方法
* 支持随时动态增加/删除/修改命令词
MultiNet 对命令词自定义方法没有限制,用户可以通过任意方式(在线/离线)等将所需的命令词按照相应的格式,组成链表发给 MultiNet 即可。
我们针对不同客户提供不同的 example 来展示一些命令词的自定义方法,大体分为以下两种。
命令词格式
^^^^^^^^^^
命令词需要满足特定的格式,具体如下:
- 中文
中文命令词需要使用汉语拼音,并且每个字的拼音拼写间要间隔一个空格。比如“打开空调”,应该写成 “da kai kong tiao”比如“打开绿色灯”需要写成“da kai lv se deng”。
- 英文
英文命令词需要使用特定音标表示每个单词的音标间用空格隔开比如“turn on the light”需要写成“TkN nN jc LiT”。
**并且我们也提供相应的工具,供用户将汉字转换为拼音、将英文单词转换为音素,详细可见:** `英文转音素工具 <../../tool/multinet_g2p.py>`__
.. only:: latex
.. figure:: ../../_static/QR_multinet_g2p.png
:alt: menuconfig_add_speech_commands
离线设置命令词
^^^^^^^^^^^^^^^
MultiNet 支持多种且灵活的命令词设置方式,用户无论通过那种方式编写命令词(代码/网络/文件),只需调用相应的 API 即可。
在这里我们提供两种常见的命令词添加方法。
- 编写 ``menuconfig`` 进行添加
可以参考 ESP-Skainet 中 example 通过 ``idf.py menuconfig -> ESP Speech Recognition-> Add Chinese speech commands/Add English speech commands`` 添加命令词。
.. figure:: ../../_static/menuconfig_add_speech_commands.png
:alt: menuconfig_add_speech_commands
menuconfig_add_speech_commands
请注意单个 Command ID 可以支持多个短语,比如“打开空调”和“开空调”表示的意义相同,则可以将其写在同一个 Command ID 对应的词条中,用英文字符“,”隔开相邻词条(“,”前后无需空格)。
然后通过在代码里调用以下 API 即可:
::
/**
* @brief Update the speech commands of MultiNet by menuconfig
*
* @param multinet The multinet handle
*
* @param model_data The model object to query
*
 * @param language The language of MultiNet
*
* @return
* - ESP_OK Success
* - ESP_ERR_INVALID_STATE Fail
*/
esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
- 通过自己创建命令词进行添加
可以参考 ESP-Skainet 中 example 了解这种添加命令词的方法。
该方法中,用户直接在代码中编写命令词,并传给 MultiNet在实际开发和产品中用户可以通过网络/UART/SPI等多种可能的方式传递所需的命令词并随时更换命令词。
在线设置命令词
^^^^^^^^^^^^^^
MultiNet 支持在运行过程中在线动态添加/删除/修改命令词,该过程无须更换模型和调整参数。具体可以参考 ESP-Skainet 中 example。
具体API说明请参考 `esp_mn_speech_commands <../../src/esp_mn_speech_commands.c>`__
运行命令词识别
--------------
命令词识别需要和 ESP-SR 中的声学算法模块AFE一起运行且 AFE 中需使能唤醒WakeNet。关于 AFE 的使用,请参考文档:
`AFE 介绍及使用 <../audio_front_end/README_CN.md>`__
当用户配置完成 AFE 后,请按照以下步骤配置和运行 MultiNet
MultiNet 初始化
~~~~~~~~~~~~~~~
- 模型加载与初始化   
请参考 `flash_model <../flash_model/README_CN.md>`__
- 设置命令词 请参考上文 #3。
MultiNet 运行
~~~~~~~~~~~~~
当用户开启 AFE 且使能 WakeNet 后,则可以运行 MultiNet。且有以下几点要求
* 传入帧长和 AFE fetch 帧长长度相等
* 支持音频格式为 16KHz16bit单通道。AFE fetch 拿到的数据也为这个格式
- 确定需要传入 MultiNet 的帧长
::
int mu_chunksize = multinet->get_samp_chunksize(model_data);
``mu_chunksize`` 是需要传入 MultiNet 的每帧音频的 ``short`` 型点数,这个大小和 AFE 中 fetch 的每帧数据点数完全一致。
- MultiNet detect
我们将 AFE 实时 ``fetch`` 到的数据送入以下 API
::
esp_mn_state_t mn_state = multinet->detect(model_data, buff);
``buff`` 的长度为 ``mu_chunksize * sizeof(int16_t)``
MultiNet 识别结果
~~~~~~~~~~~~~~~~~
命令词识别支持两种基本模式:
* 单次识别
* 连续识别
命令词识别必须和唤醒搭配使用,当唤醒后可以运行命令词的检测。
命令词模型在运行时,会实时返回当前帧的识别状态
``mn_state`` ,目前分为以下几种识别状态:
- ESP_MN_STATE_DETECTING
该状态表示目前正在识别中,还未识别到目标命令词。
- ESP_MN_STATE_DETECTED
该状态表示目前识别到了目标命令词,此时用户可以调用 ``get_results`` 接口获取识别结果。
::
esp_mn_results_t *mn_result = multinet->get_results(model_data);
识别结果的信息存储在 ``get_result`` API 的返回值中,返回值的数据类型如下:
::
typedef struct{
esp_mn_state_t state;
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
} esp_mn_results_t;
- 其中 ``state`` 为当前识别的状态
- ``num`` 表示识别到的词条数目, ``num`` <= 5即最多返回 5 个候选结果
- ``phrase_id`` 表示识别到的词条对应的 Phrase ID
- ``prob`` 表示识别到的词条识别概率,从大到小依次排列
用户可以使用 ``phrase_id[0]`` 和 ``prob[0]`` 拿到概率最高的识别结果。
- ESP_MN_STATE_TIMEOUT
该状态表示长时间未检测到命令词,自动退出。等待下次唤醒。
| 因此:
| 当命令词识别返回状态为 ``ESP_MN_STATE_DETECTED`` 时退出命令词识别,则为单次识别模式;
| 当命令词识别返回状态为 ``ESP_MN_STATE_TIMEOUT`` 时退出命令词识别,则为连续识别模式;
其他配置和使用
--------------
阈值设置
~~~~~~~~
该功能仍在开发中。

View File

@ -0,0 +1,193 @@
测试方法与测试报告
==================
:link_to_translation:`en:[English]`
测试场景
~~~~~~~~
* 房间大小
* 地面大小:至少 4 m × 3.2 m
* 高度:至少 2.30 m
* 房间装饰
* 地板需配有地毯在天花板上配备一些通常在办公室中常见的声学阻尼。在1到2面墙上挂有窗帘防止强反射。
* 房间混响RT601在[125, 8k]范围内要满足0.2-0.7s的要求。
* 不要使用消音室。
* 环境底噪要求:应该 < 35dBA最好是 < 30dBA。
* 温度和湿度要求70 ± 20 华氏度,相对湿度为 50% ± 20%。
* 设备位置
* 根据产品可能的实际使用方式,确定设备在性能测试时摆放的位置,比如设 备高度、离墙的距离、离地面的距离、角度等。
* 外噪的角度、距离、高度和分贝
* 外噪到设备麦克的角度、距离,外噪距离地面的高度, 在设备麦克处测量到的外噪分贝值。
* 人声的角度、高度、距离和分贝
* 性能测试时播放的测试语音集称为人声。人声到设备 麦克的角度、距离,人声距离地面的高度,在设备麦克处测量到的人声分贝值。
在不同的测试场景中RT60、房间底噪、设备的位置是三个通用因素在这些因素被确定之后将被运用到不同的测试场景中。
唤醒率测试
~~~~~~~~~~
唤醒率测试是指当设备处于待唤醒状态时被唤醒成功的概率。
除通用因素外,通常唤醒率测试还需要确定的因素如表 1 所示。可以根据产品定位设计噪声和人声相对设备同向或者不同向的测试场景,或者多噪声源的测试场景,以及不同的 SNR 场景。
+--------------+----------+----------+----------+----------+----------+----------+------+
| 测试场景编号 | 外噪距离 | 外噪角度 | 外噪分贝 | 人声距离 | 人声角度 | 人声分贝 | SNR |
+==============+==========+==========+==========+==========+==========+==========+======+
| 1 | / | / | <35dBA | 3m | 90° | 54dBA | / |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 2 | 2m | 45° | 45dBA | 3m | 90° | 54dBA | 9dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 3 | 2m | 45° | 55dBA | 3m | 90° | 59dBA | 4dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 4 | 2m | 45° | 65dBA | 3m | 90° | 64dBA | -1dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
.. figure:: ../../_static/test_reference_position1.png
:align: center
:alt: overview
在唤醒测试场景下建议人工嘴声音源位于语音模块麦克风正前方水平直线距离3米人工嘴声音源距离地面1.5米。语音模块ESP32-S3和声压计位于同一垂直方向声压计在语音模块ESP32-S3正上方75厘米处。噪声源在斜45度方向距地高度1.2米距离语音模块ESP32-S3 2米。
.. figure:: ../../_static/test_reference_position2.png
:align: center
:alt: overview
识别测试
~~~~~~~~
识别率测试是指当设备处于识别状态时成功识别词表里包含的命令词的概率。
除通用因素外,通常识别率测试还需要确定的因素如下表所示。同唤醒率测试一样,识别率测试也可以根据产品定位去设计多样的测试场景。
+--------------+----------+----------+----------+----------+----------+----------+------+
| 测试场景编号 | 外噪距离 | 外噪角度 | 外噪分贝 | 人声距离 | 人声角度 | 人声分贝 | SNR |
+==============+==========+==========+==========+==========+==========+==========+======+
| 1 | / | / | <35dBA | 3m | 90° | 54dBA | / |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 2 | 2m | 45° | 45dBA | 3m | 90° | 54dBA | 9dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 3 | 2m | 45° | 55dBA | 3m | 90° | 59dBA | 4dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
| 4 | 2m | 45° | 65dBA | 3m | 90° | 64dBA | -1dB |
+--------------+----------+----------+----------+----------+----------+----------+------+
误唤醒测试
~~~~~~~~~~
误唤醒率测试是指设备在产品定义的应用场景下被非唤醒词成功唤醒的概率。需要根据产品定义的应用场景中,设备可能处于的环境来设计误唤醒的测试场景,比如在家居应用场景 中,设备可能处于安静、外噪、设备自噪等环境。
除通用因素外,通常误唤醒率测试还需要确定的因素如下表所示。误唤醒率一般采用的衡量单位为次/小时。
+--------------+----------+----------+----------+----------+----------+
| 测试场景编号 | 噪声类型 | 噪声距离 | 噪声角度 | 噪声分贝 | 测试时长 |
+==============+==========+==========+==========+==========+==========+
| 1 | 安静 | / | / | <35dBA | 24小时 |
+--------------+----------+----------+----------+----------+----------+
| 2 | 白噪声 | 2m | 45° | 65dBA | 24小时 |
+--------------+----------+----------+----------+----------+----------+
| 3 | 新闻 | 2m | 45° | 65dBA | 24小时 |
+--------------+----------+----------+----------+----------+----------+
| 4 | 酒吧 | 2m | 45° | <65dBA | 24小时 |
+--------------+----------+----------+----------+----------+----------+
唤醒打断率测试
~~~~~~~~~~~~~~
对于有 AEC 功能的产品,通常还需要测试唤醒打断率。唤醒打断率是指设备有自噪时, 即有 TTS3 播报或播放音频时,被唤醒成功的概率。
除通用因素外,通常唤醒打断率测试还需要确定的因素如下表所示。
+--------------+--------------+----------+----------+----------+----------+
| 测试场景编号 | 设备自噪类型 | 噪声分贝 | 人声距离 | 人声角度 | 人声分贝 |
+==============+==============+==========+==========+==========+==========+
| 1 | 音乐 | 65dB | 3米 | 90° | 64dB |
+--------------+--------------+----------+----------+----------+----------+
| 2 | TTS | 65dB | 3米 | 90° | 64dB |
+--------------+--------------+----------+----------+----------+----------+
响应时间测试
~~~~~~~~~~~~
搭建好测试环境,打开语音录制工具,播放测试集,播报完毕后,利用语音录制工具计算出语音指令与播报之间的时间间隔,即为响应时间。
步骤:
#. 利用人工嘴播放测试集。
#. 记录测试数据。
#. 计算响应时间。
乐鑫语音测试结果
~~~~~~~~~~~~~~~~
唤醒率测试
-----------
+----------------+------------+-------------+-----------+-----------+-----------+--------+--------+
| 测试项 | 环境噪声 | 噪声指标 | 信噪比SNR | 角度 | 距离 | 唤醒率 | 识别率 |
+================+============+=============+===========+===========+===========+========+========+
| 本地唤醒率测试 | 安静 | 人声59dBA | NA | 人声90° | 人声3米 | 99% | 91.5% |
| | | | | | | | |
| | | 噪声NA | | 噪声45° | 噪声2米 | | |
| +------------+-------------+-----------+ | +--------+--------+
| | 白噪声 | 人声59dBA | ≥4dBA | | | 99% | 78.25% |
| | | | | | | | |
| | | 噪声55dBA | | | | | |
| +------------+-------------+-----------+ | +--------+--------+
| | 人声类噪声 | 人声59dBA | ≥4dBA | | | 99% | 82.77% |
| | | | | | | | |
| | | 噪声55dBA | | | | | |
+----------------+------------+-------------+-----------+-----------+-----------+--------+--------+
误唤醒测试
-----------
+------------+----------+-------------+----------+------------+
| 测试项 | 环境噪声 | 噪声指标 | 测试时间 | 误唤醒次数 |
+============+==========+=============+==========+============+
| 误唤醒测试 | 音乐 | 噪声55dBA | 12h | 1 |
| +----------+-------------+----------+------------+
| | 新闻 | 噪声55dBA | 12h | 1 |
+------------+----------+-------------+----------+------------+
唤醒打断率测试
--------------
+----------------+----------+---------------+-----------+--------+--------------+
| 测试项 | 环境噪声 | 噪声指标 | 信噪比SNR | 唤醒率 | 命令词识别率 |
+================+==========+===============+===========+========+==============+
| 唤醒打断率测试 | 音乐 | 人声59dBA | ≥ 10dBA | 100% | 96% |
| | | 噪声69dBA | | | |
| +----------+---------------+-----------+--------+--------------+
| | TTS | 人声59dBA | ≥ 10dBA | 100% | 96% |
| | | 噪声69dBA | | | |
+----------------+----------+---------------+-----------+--------+--------------+
响应时间测试
------------
+--------------+----------+---------------+------------+----------+
| 测试项 | 环境噪声 | 噪声指标 | 信噪比 SNR | 响应时间 |
+==============+==========+===============+============+==========+
| 响应时间测试 | 安静 | 人声59dBA | NA | <500 ms |
| | | 噪声NA | | |
+--------------+----------+---------------+------------+----------+
.. figure:: ../../_static/test_response_time.png
:align: center
:alt: overview

View File

@ -0,0 +1,95 @@
乐鑫语音唤醒方案客户定制流程
=============================
:link_to_translation:`en:[English]`
离线唤醒词定制服务
-------------------
乐鑫提供 离线语音唤醒词 定制服务,详情如下:
#. “HI乐鑫”“你好小鑫” 等官方公开的唤醒词,客户可直接商用
- 如 ADFASR Demo 提供的离线命令词,同时乐鑫会逐渐开放更多的商用 Free 关键词
#. 除官方开放的唤醒词,可接受客户定制服务,分如下两种情况
- 如果客户提供 唤醒词语料
- 需要提供大于 2 万条合格的语料(语料需求见下文)
- 语料提供给乐鑫后,需要 23 周进行模型训练及调优
- 根据量级收取少量模型定制费用
- 如果客户不提供唤醒词语料
- 所有训练语料由乐鑫采集提供
- 语料提供给乐鑫后,需要 23 周进行模型训练及调优
- 根据量级收取少量模型定制费用(语料采集费用另收)
- 费用收取具体定价和定制时间,烦请邮件至 sales@espressif.com 协议商定
- 收费取决于 **唤醒词定制的数量** 以及 **产品量产数量**
#. 对于乐鑫唤醒词模型:
- 目前单个模型最多支持5个及以内的唤醒词识别
- 每个唤醒词通常由 3-6 音节组成比如“hi乐鑫”“Alexa”“小爱同学”“你好天猫”等
- 可多个唤醒模型一起使用,具体需根据客户应用的资源消耗确定
训练语料要求
------------
客户可自备训练语料或向第三方采购,对于语料有以下要求
- 语料音频格式要求
- 采样率sample rate16 KHz
- 编码encoding16-bit signed int
- 通道数channelmono
- 格式wav
- 语料采集要求
- 采样人数:最好样本可以大于 500 人,其中男女,年龄分布均衡,儿童不小于 100 人
- 采样环境:环境噪声低(< 40 dB建议在语音室等专业环境下录制
- 录制场景:距离麦克风 1 m 处每人录制 15 遍,其中 5 遍快语速5 遍正常语速5 遍慢语速;距离麦克风 3 m 处每人录制 15 遍,其中 5 遍快语速5 遍正常语速5 遍慢语速
- 录制设备:高保真麦克风
- 样本命名需体现样本信息:如 female_age_fast_id.wav 或有单独表格记录每个样本的年龄,性别等信息
硬件设计与测试
--------------
语音唤醒效果与硬件设计以及腔体结构有很大关系,为确保硬件设备设计合理,请认真阅读以下内容
#. 硬件设计要求
- 对于各类语音音箱类设计,乐鑫可提供 原理图PCB 等设计参考客户可以根据自身具体需求设计修改设计完毕后乐鑫可提供Review服务避免常见设计问题。
- 腔体结构,最好有专门的声学人员参与设计,乐鑫不提供 ID 设计类的参考,客户可以市场上主流音箱设计为参考
- 例如:天猫精灵、小度音箱、谷歌音箱等
#. 硬件设计好后,客户可通过以下简单测试,验证硬件设计效果(下列测试都是基于语音室环境,客户可以根据自身测试环境做调整)
- 录音测试,验证 MIC、codec 录音增益以及失真情况
- 音源 90 dB距离 0.1 m 播放样本,调节增益,保证录音样本不饱和
- 使用扫频文件0~20 KHz使用 16 KHz 采样率录音,音频不会出现明显频率混叠
- 录制 100 个语音样本,使用公开的云端语音识别端口识别,识别率达到指定标准
- 播音测试,验证 功率放大器PA、喇叭的失真情况
- 测试PA功率 @1% 总谐波失真THD
- 语音算法测试,验证 AEC、BFM、NS 效果
- 首先需要注意下参考信号延时,不同的 AEC 算法有不同的要求
- 以实际产品场景为测试指标,例如 MIC 播放 85DB-90DB 大梦想家.wav ,设备回采
- 保存回声参考信号、回声消除后的信号分析,对比查看 AEC、NS、BFM 等效果
- DSP性能测试验证DSP参数是否合适同时尽可能减少DSP算法中的非线性失真
- 降噪(Noise suppression)算法性能测试
- 回声消除(Acoustic Echo Cancellation)算法性能测试
- 语音增强(Speech Enhancement)算法性能测试
#. 硬件设计完毕后, **可寄送** 1-2 台硬件至乐鑫,乐鑫会基于客户整机做唤醒词性能调优

View File

@ -0,0 +1,107 @@
WakeNet
========
:link_to_translation:`en:[English]`
WakeNet 是一个基于神经网络、为低功耗嵌入式 MCU 设计的唤醒词模型,目前支持 5 个以内的唤醒词识别。
Overview
--------
WakeNet的流程图如下
.. figure:: ../../_static/wakenet_workflow.png
:alt: overview
.. raw:: html
<center>
.. raw:: html
</center>
- Speech Features
我们使用 `MFCC <https://en.wikipedia.org/wiki/Mel-frequency_cepstrum>`__ 方法提取语音频谱特征。输入的音频文件采样率为16KHz单声道编码方式为signed 16-bit。每帧窗宽和步长均为30ms。
.. only:: latex
.. figure:: ../../_static/QR_MFCC.png
:alt: overview
- Neural Network
神经网络结构已经更新到第9版其中
- wakeNet1,wakeNet2,wakeNet3,wakeNet4已经停止使用。
- wakeNet5应用于ESP32芯片。
- wakeNet8和wakeNet9应用于ESP32S3芯片模型基于 `Dilated Convolution <https://arxiv.org/pdf/1609.03499.pdf>`__ 结构。
.. only:: latex
.. figure:: ../../_static/QR_Dilated_Convolution.png
:alt: overview
注意WakeNet5,WakeNet5X2 和 WakeNet5X3 的网络结构一致,但是 WakeNet5X2 和 WakeNet5X3 的参数比 WakeNet5 要多。请参考 `性能测试 <#性能测试>`__ 来获取更多细节。
- Keyword Trigger Method
对连续的音频流,为准确判断关键词的触发,我们通过计算若干帧内识别结果的平均值 M 来判断触发。当 M 大于指定阈值,发出触发的命令。
以下表格展示在不同芯片上的模型支持:
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Chip | ESP32 | ESP32S3 |
+=================+===========+=============+=============+===========+===========+===========+===========+
| model | WakeNet 5 | WakeNet 8 | WakeNet 9 |
| +-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| | WakeNet 5 | WakeNet 5X2 | WakeNet 5X3 | Q16 | Q8 | Q16 | Q8 |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Hi,Lexin | √ | √ | √ | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| nihaoxiaozhi | √ | | √ | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| nihaoxiaoxin | | | √ | | | | |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| xiaoaitongxue | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Alexa | | | | √ | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Hi,ESP | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
| Customized word | | | | | | | √ |
+-----------------+-----------+-------------+-------------+-----------+-----------+-----------+-----------+
WakeNet使用
-----------
- WakeNet 模型选择
WakeNet 模型选择请参考 `flash model 介绍 <../flash_model/README_CN.md>`__
自定义的唤醒词,请参考 `乐鑫语音唤醒词定制流程 <乐鑫语音唤醒词定制流程.md>`__
- WakeNet 运行
WakeNet 目前包含在语音前端算法 `AFE <../audio_front_end/README_CN.md>`__ 中,默认为运行状态,并将识别结果通过 AFE fetch 接口返回。
如果用户不需要初始化 WakeNet请在 AFE 配置时选择:
::
afe_config.wakenet_init = false;
如果用户想临时关闭/打开 WakeNet, 请在运行过程中调用:
::
afe_handle->disable_wakenet(afe_data)
afe_handle->enable_wakenet(afe_data)
性能测试
--------
具体请参考 `Performance Test <../performance_test/README.md>`__
唤醒词定制
----------
如果需要定制唤醒词,请参考 `乐鑫语音唤醒词定制流程 <乐鑫语音唤醒词定制流程.md>`__