Merge branch 'feat/esp32c5_aec' into 'master'

Feat/esp32c5 aec

See merge request speech-recognition-framework/esp-sr!136
This commit is contained in:
Sun Xiang Yu 2025-02-11 20:48:29 +08:00
commit 6b02042d4f
15 changed files with 344 additions and 25 deletions

View File

@ -1,8 +1,10 @@
# Change log for esp-sr
## Known issues:
- Available storage is less than the remaining flash space on IDF v5.0.
If the model partition cannot be mapped successfully, check the remaining free pages with `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.
## 2.0.0
- Add vadnet1_medium model
- Refactor AFE interface. Note AFE v2.0 is not compatible with previous versions
- Add esp32c5 AEC support
- Add some new wake words
## 1.9.5
- Add Hi,Jason; 小鸭小鸭; 璃奈板 wake word models

View File

@ -95,6 +95,24 @@ if((${IDF_TARGET} STREQUAL "esp32s3") OR (${IDF_TARGET} STREQUAL "esp32p4") OR (
endif()
endif()
elseif(${IDF_TARGET} STREQUAL "esp32c5")
    # ESP32-C5: the component registers a single source file and links the
    # prebuilt audio processor library for this target.
    set(srcs
        "lib/${IDF_TARGET}/dummy.c"
    )

    set(include_dirs
        "include/${IDF_TARGET}"
    )

    idf_component_register(SRCS ${srcs}
                           INCLUDE_DIRS ${include_dirs}
    )

    # "-Wno-error=format" and "-Wno-format" are two separate options; they were
    # previously fused into one malformed flag ("-Wno-error=format=-Wno-format").
    component_compile_options(-ffast-math -O3 -Wno-error=format -Wno-format)

    add_prebuilt_library(esp_audio_processor "${CMAKE_CURRENT_SOURCE_DIR}/lib/${IDF_TARGET}/libesp_audio_processor.a")
    target_link_libraries(${COMPONENT_LIB} PRIVATE esp_audio_processor)
elseif((${IDF_TARGET} STREQUAL "esp32s2") OR (${IDF_TARGET} STREQUAL "esp32c3") OR (${IDF_TARGET} STREQUAL "esp32c6"))
#Only support TTS on esp32s2, esp32c3 and esp32c6

View File

@ -67,15 +67,17 @@ The ``input_format`` parameter specifies the arrangement of audio channels in th
+-----------+---------------------+
**Example:**
- ``"MMNR"``: Indicates four channels: two microphone channels, one unused channel, and one playback reference channel.
``"MMNR"`` Indicates four channels: two microphone channels, one unused channel, and one playback reference channel.
**Key Points:**
- The input data must be arranged in **channel-interleaved format**.
.. note::
The input data must be arranged in **channel-interleaved format**.
Using the AFE Framework
----------------------------
In ``menuconfig`` -> ``ESP Speech Recognition``, select the models required by the AFE (Audio Front-End), such as the WakeNet model, VAD (Voice Activity Detection) model, and NS (Noise Suppression) model, and then call the AFE framework in your code using the following steps.
For reference, you can check the code in :project_file:`test_apps/esp-sr/main/test_afe.cpp`.
Step 1: Initialize AFE Configuration
@ -88,10 +90,10 @@ Get the default configuration using ``afe_config_init()`` and customize paramete
srmodel_list_t *models = esp_srmodel_init("model");
afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
- **``input_format``**: Define the channel arrangement (e.g., ``"MMNR"``).
- **``models``**: List of models (e.g., for NS, VAD, or WakeNet).
- **``afe_type``**: Type of AFE (e.g., ``AFE_TYPE_SR`` for speech recognition).
- **``afe_mode``**: Performance mode (e.g., ``AFE_MODE_HIGH_PERF``).
- ``input_format``: Define the channel arrangement (e.g., ``MMNR``).
- ``models``: List of models (e.g., for NS, VAD, or WakeNet).
- ``afe_type``: Type of AFE (e.g., ``AFE_TYPE_SR`` for speech recognition).
- ``afe_mode``: Performance mode (e.g., ``AFE_MODE_HIGH_PERF``).
Step 2: Create AFE Instance
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -117,9 +119,9 @@ Input audio data to the AFE for processing. The input data must match the ``inpu
int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * feed_nch * sizeof(int16_t));
afe_handle->feed(afe_data, feed_buff);
- **``feed_chunksize``**: Number of samples to feed per frame.
- **``feed_nch``**: Number of channel of input data.
- **``feed_buff``**: Channel-interleaved audio data (16-bit signed, 16 kHz).
- ``feed_chunksize``: Number of samples to feed per frame.
- ``feed_nch``: Number of channels in the input data.
- ``feed_buff``: Channel-interleaved audio data (16-bit signed, 16 kHz).
Step 4: Fetch Processed Audio
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -51,7 +51,7 @@ Resource Consumption
- **MMNR:** two microphone channels, one unused channel, and one playback reference channel
- **Models:** nsnet2, vadnet1_medium, wn9_hilexin
.. list-table:: ESP32-S3 AFE configuration and Performance
.. list-table:: AFE configuration and Performance
:widths: 25 15 15 20 20
:header-rows: 1

View File

@ -64,10 +64,11 @@ AFE 声学前端算法框架
+-----------+---------------------+
**示例:**
- ``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。
``"MMNR"``:表示四通道排列,包含两个麦克风通道、一个未使用通道和一个播放参考通道。
**关键点:**
- 输入数据必须采用 **通道交错排列格式**
.. note::
输入数据必须采用 **通道交错排列格式**
使用AFE框架
----------------------------
@ -84,10 +85,10 @@ AFE 声学前端算法框架
srmodel_list_t *models = esp_srmodel_init("model");
afe_config_t *afe_config = afe_config_init("MMNR", models, AFE_TYPE_SR, AFE_MODE_HIGH_PERF);
- **``input_format``**:定义通道排列(如 ``"MMNR"``)。
- **``models``**模型列表如NS、VAD或WakeNet模型
- **``afe_type``**AFE类型``AFE_TYPE_SR`` 表示语音识别场景)。
- **``afe_mode``**:性能模式(如 ``AFE_MODE_HIGH_PERF`` 表示高性能模式)。
- ``input_format``:定义通道排列(如 ``MMNR``)。
- ``models``:模型列表(如NS、VAD或WakeNet模型)。
- ``afe_type``:AFE类型(如 ``AFE_TYPE_SR`` 表示语音识别场景)。
- ``afe_mode``:性能模式(如 ``AFE_MODE_HIGH_PERF`` 表示高性能模式)。
步骤2创建AFE实例
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@ -113,9 +114,9 @@ AFE 声学前端算法框架
int16_t *feed_buff = (int16_t *) malloc(feed_chunksize * feed_nch * sizeof(int16_t));
afe_handle->feed(afe_data, feed_buff);
- **``feed_chunksize``**:每帧输入的样本数。
- **``feed_nch``**:输入数据的通道数。
- **``feed_buff``**通道交错的音频数据16位有符号16 kHz
- ``feed_chunksize``:每帧输入的样本数。
- ``feed_nch``:输入数据的通道数。
- ``feed_buff``:通道交错的音频数据(16位有符号,16 kHz)。
步骤4获取处理结果
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

View File

@ -1,4 +1,4 @@
version: "2.0.0~1-rc.2"
version: "2.0.0~1-rc.3"
description: esp_sr provides basic algorithms for Speech Recognition applications
url: https://github.com/espressif/esp-sr
dependencies:

105
include/esp32c5/esp_aec.h Normal file
View File

@ -0,0 +1,105 @@
// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License
#ifndef _ESP_AEC_H_
#define _ESP_AEC_H_
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
#define USE_AEC_FFT // Not kiss_fft
#define AEC_SAMPLE_RATE 16000 // Only Support 16000Hz
#define AEC_FRAME_LENGTH_MS 32
typedef struct aec_handle_t aec_handle_t;
/**
 * @brief Operating modes of the AEC.
 *
 * NOTE(review): the value 2 is not assigned to any mode in this enum -
 * confirm whether it is reserved.
 */
typedef enum {
AEC_MODE_SR_LOW_COST = 0, // Low-cost AEC for speech recognition
AEC_MODE_SR_HIGH_PERF = 1, // High-performance AEC for speech recognition
AEC_MODE_VOIP_LOW_COST = 3, // Low-cost AEC for voice communication
AEC_MODE_VOIP_HIGH_PERF = 4, // High-performance AEC for voice communication
} aec_mode_t;
/**
 * @brief Create an AEC instance.
 *        Get the frame size of the created instance with aec_get_chunksize().
 *
 * @param sample_rate The sampling frequency in Hz; must be 16000 (see AEC_SAMPLE_RATE).
 * @param filter_length Number of filters; 4 is recommended. The larger the filter_length, the higher the resource consumption.
 * @param channel_num Number of input microphone channels.
 * @param mode The AEC mode; AEC_MODE_SR_LOW_COST is recommended.
 * @return
 *         - NULL: Create failed
 *         - Others: The AEC instance; free it with aec_destroy()
 */
aec_handle_t *aec_create(int sample_rate, int filter_length, int channel_num, aec_mode_t mode);
/**
 * @brief Create an AEC instance, same as aec_create() but without a sample_rate argument.
 *        NOTE(review): presumably the sample rate is fixed to AEC_SAMPLE_RATE - confirm.
 *
 * @param filter_length Number of filters; 4 is recommended. The larger the filter_length, the higher the resource consumption.
 * @param channel_num Number of input microphone channels.
 * @param mode The AEC mode; AEC_MODE_SR_LOW_COST is recommended.
 * @return
 *         - NULL: Create failed
 *         - Others: The AEC instance
 */
aec_handle_t *aec_pro_create(int filter_length, int channel_num, aec_mode_t mode);
/**
 * @brief Perform echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
 *        The frame size (samples per frame) is reported by aec_get_chunksize().
 *
 * @warning indata, refdata and outdata must be 16-bit signed. Please allocate their memory with heap_caps_aligned_alloc().
 *
 * @param handel The AEC instance.
 * @param indata An array of 16-bit signed audio samples from the mic.
 *               NOTE(review): the multi-channel format note "ch0 ch0 ch0 ..., ch1 ch1 ch1 ..." was previously
 *               attached to the instance parameter; it presumably describes indata - confirm.
 * @param refdata An array of 16-bit signed audio samples sent to the speaker.
 * @param outdata Returns the near-end signal with echo removed. Format for multi-channel data is "ch0 ch0 ch0..., ch1 ch1 ch1 ..."
 * @return None
 *
 */
void aec_process(const aec_handle_t *handel, int16_t *indata, int16_t *refdata, int16_t *outdata);
/**
 * @brief Get the frame size of the AEC (the number of samples in one frame).
 *
 * @param handle The AEC instance.
 *
 * @return Frame size in samples
 */
int aec_get_chunksize(const aec_handle_t *handle);
/**
 * @brief Get the string representation of an AEC mode.
 *
 * @param aec_mode The AEC mode.
 *
 * @return AEC mode string.
 *         NOTE(review): ownership of the returned string is not documented here;
 *         presumably it must not be freed or modified by the caller - confirm.
 */
char * aec_get_mode_string(aec_mode_t aec_mode);
/**
 * @brief Free an AEC instance.
 *
 * @param handel The AEC instance to free.
 *
 * @return None
 *
 */
void aec_destroy(aec_handle_t *handel);
#ifdef __cplusplus
}
#endif
#endif //_ESP_AEC_H_

0
lib/esp32c5/dummy.c Normal file
View File

Binary file not shown.

View File

@ -0,0 +1,10 @@
# Project-level CMakeLists.txt for the esp32c5 test application.
# ESP-IDF v5.x projects declare CMake 3.16 as the minimum supported version.
cmake_minimum_required(VERSION 3.16)

# Extra component search paths:
#   - the unit-test-app components shipped with ESP-IDF
#   - the esp-sr component under test (path relative to this project)
set(EXTRA_COMPONENT_DIRS "$ENV{IDF_PATH}/tools/unit-test-app/components"
                         "../../../esp-sr")

include($ENV{IDF_PATH}/tools/cmake/project.cmake)
project(esp32c5_test)

View File

@ -0,0 +1,11 @@
# Test application "main" component: the Unity entry point plus the AEC test case.
# WHOLE_ARCHIVE is passed to idf_component_register for this component.
idf_component_register(
    SRCS
        "app_main.cpp"
        "test_aec.cpp"
    INCLUDE_DIRS
        "."
    REQUIRES
        unity
        esp-sr
        esp_timer
    WHOLE_ARCHIVE
)

View File

@ -0,0 +1,47 @@
/* Example test application for testable component.
This example code is in the Public Domain (or CC0 licensed, at your option.)
Unless required by applicable law or agreed to in writing, this
software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied.
*/
#include "unity.h"
#include <stdio.h>
#include <string.h>
extern "C" void app_main(void)
{
/* Entry point of the test application.
 *
 * Registered Unity tests can be run in several ways; in practice only one
 * of them is usually needed. Wrapping the calls in UNITY_BEGIN() / UNITY_END()
 * makes Unity print a summary (number of tests executed/failed/ignored) of
 * the tests executed between them:
 *
 *   - one test by its name:      unity_run_test_by_name("<test name>");
 *   - tests with a given tag:    unity_run_tests_by_tag("[tag]", false);
 *   - tests without a given tag: unity_run_tests_by_tag("[tag]", true);
 *   - all registered tests:      unity_run_all_tests();
 *
 * This application starts the interactive test menu instead.
 */
/* This function will not return, and will be busy waiting for UART input.
 * Make sure that task watchdog is disabled if you use this function.
 */
unity_run_menu();
}

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,70 @@
/* test_aec.cpp: Unit test for the ESP32-C5 AEC (acoustic echo cancellation) library.
This example code is in the Public Domain (or CC0 licensed, at your option.)
Unless required by applicable law or agreed to in writing, this
software is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
CONDITIONS OF ANY KIND, either express or implied.
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "freertos/FreeRTOS.h"
#include "freertos/task.h"
#include "esp_aec.h"
#include "audio_test_file.h"
#include "unity.h"
#include "esp_timer.h"
/*
 * AEC test for ESP32-C5.
 *
 * 1. Creates and destroys an AEC instance once and prints the heap difference.
 * 2. Creates a second instance, feeds it the embedded mic/reference recordings
 *    chunk by chunk and accumulates the time spent in aec_process().
 * 3. Destroys the instance and asserts that the free heap size is back to the
 *    value measured at the start of the test.
 */
TEST_CASE("test esp32c5 aec", "[aec]")
{
    heap_caps_print_heap_info(MALLOC_CAP_8BIT);
    int start_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    int start_internal_size = heap_caps_get_free_size(MALLOC_CAP_INTERNAL);

    int sample_rate = 16000;

    /* First create/destroy cycle: only reports the heap difference. */
    aec_handle_t *aec_handle = aec_create(sample_rate, 2, 1, AEC_MODE_SR_LOW_COST);
    TEST_ASSERT_NOT_NULL(aec_handle);
    aec_destroy(aec_handle);
    int first_end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    printf("memory leak for first init: %d\n", start_size - first_end_size);

    /* Second instance: the one actually used for processing. */
    aec_handle = aec_create(sample_rate, 2, 1, AEC_MODE_SR_LOW_COST);
    TEST_ASSERT_NOT_NULL(aec_handle);
    int audio_chunksize = aec_get_chunksize(aec_handle);
    printf("audio chunksize:%d\n", audio_chunksize); //512
    TEST_ASSERT_TRUE(audio_chunksize > 0);

    const size_t chunk_bytes = (size_t)audio_chunksize * sizeof(int16_t);

    /* NOTE(review): esp_aec.h asks for buffers allocated with
     * heap_caps_aligned_alloc(); plain malloc() is kept here as in the
     * original test - confirm whether aligned buffers are required on esp32c5. */
    int16_t *buffer = (int16_t *)malloc(chunk_bytes);
    int16_t *ref_buffer = (int16_t *)malloc(chunk_bytes);
    int16_t *out_buffer = (int16_t *)malloc(chunk_bytes);
    TEST_ASSERT_NOT_NULL(buffer);
    TEST_ASSERT_NOT_NULL(ref_buffer);
    TEST_ASSERT_NOT_NULL(out_buffer);

    int chunks = 0;
    int64_t elapsed_us = 0; /* esp_timer_get_time() returns int64_t microseconds */

    /* Process whole chunks only, and never read past the end of either recording.
     * Assumes audio_mic_file / audio_ref_file are int16_t arrays, so the pointer
     * offset below is counted in samples - TODO confirm against audio_test_file.h. */
    while ((size_t)(chunks + 1) * chunk_bytes <= sizeof(audio_mic_file) &&
           (size_t)(chunks + 1) * chunk_bytes <= sizeof(audio_ref_file)) {
        memcpy(buffer, audio_mic_file + chunks * audio_chunksize, chunk_bytes);
        memcpy(ref_buffer, audio_ref_file + chunks * audio_chunksize, chunk_bytes);

        int64_t t0 = esp_timer_get_time();
        aec_process(aec_handle, buffer, ref_buffer, out_buffer);
        elapsed_us += esp_timer_get_time() - t0;

        chunks++;
    }

    free(buffer);
    free(ref_buffer);
    free(out_buffer);

    /* The I/O buffers are already released, so this reports what the AEC instance holds. */
    printf("RAM size after aec processing: total:%d, internal:%d\n",
           (int)(start_size - (int)heap_caps_get_free_size(MALLOC_CAP_8BIT)),
           (int)(start_internal_size - (int)heap_caps_get_free_size(MALLOC_CAP_INTERNAL)));

    /* Compute the audio duration in 64-bit to avoid int overflow on long recordings. */
    int audio_ms = (int)((int64_t)chunks * audio_chunksize * 1000 / sample_rate);
    printf("Done! Took %ld ms to parse %d ms worth of samples in %d iterations.\n",
           (long)(elapsed_us / 1000), audio_ms, chunks);

    aec_destroy(aec_handle);
    int end_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
    printf("memory leak:%d\n", start_size - end_size);
    TEST_ASSERT_EQUAL_INT(start_size, end_size);
}

View File

@ -0,0 +1,6 @@
# This file was generated using idf.py save-defconfig. It can be edited manually.
# Espressif IoT Development Framework (ESP-IDF) 5.5.0 Project Minimal Configuration
#
CONFIG_IDF_TARGET="esp32c5"
CONFIG_ESP_MAIN_TASK_STACK_SIZE=148584
CONFIG_ESP_TASK_WDT_EN=n