Merge branch 'feat/add_nsnet1' into 'master'

feat: Add first noise suppression model (nsnet1) for ESP32-S3

See merge request speech-recognition-framework/esp-sr!77
This commit is contained in:
Sun Xiang Yu 2023-11-21 16:23:34 +08:00
commit c22423d2d4
22 changed files with 2183 additions and 48 deletions

View File

@ -4,6 +4,10 @@
- Available storage is less than the remaining flash space on IDF v5.0.
If the model partition cannot be mapped successfully, check the remaining free pages with `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.
## unreleased
- Add Chinese MultiNet7 models
- Add first Noise Suppression model: nsnet1
## 1.5.1
- Reduce Internal RAM of multinet7
- Update benchmark

View File

@ -93,6 +93,7 @@ elseif(${IDF_TARGET} STREQUAL "esp32s3")
esp_tts_chinese
voice_set_xiaole
wakenet
nsnet
"-Wl,--end-group")
set(MVMODEL_EXE ${COMPONENT_PATH}/model/movemodel.py)

View File

@ -29,6 +29,25 @@ choice AFE_INTERFACE_SEL
endchoice
config USE_NSNET
bool "use nsnet"
default "n"
choice SR_NSN_MODEL_LOAD
prompt "Select deep noise suppression"
default SR_NSN_NSNET1
depends on USE_NSNET
help
Select the deep noise suppression to be loaded.
config SR_NSN_NONE
bool "None"
config SR_NSN_NSNET1
bool "Deep noise suppression v1 (nsnet1)"
depends on IDF_TARGET_ESP32S3
endchoice
config USE_WAKENET
bool "use wakenet"

View File

@ -268,6 +268,10 @@ AEC 的使用和 WakeNet 相似,用户可以根据自己的需求来停止或
.pcm_config.mic_num = 2, \
// 配置音频参考回路通道数
.pcm_config.ref_num = 1, \
// 配置 NS 算法的模式:NS_MODE_SSP 为信号处理算法,NS_MODE_NET 为基于网络的降噪算法
.afe_ns_mode = NS_MODE_SSP, \
// 降噪网络的模型名字,默认为"nsnet1"
.afe_ns_model_name = "nsnet1", \
}

View File

@ -39,6 +39,12 @@ typedef struct {
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
/**
* @brief Noise suppression (NS) method selected through afe_config_t.afe_ns_mode
*/
typedef enum {
NS_MODE_SSP = 0, // signal-processing based noise suppression
NS_MODE_NET = 1, // neural-network based (deep) noise suppression
} afe_ns_mode_t;
/**
* @brief Function to get the debug audio data
*
@ -82,6 +88,8 @@ typedef struct {
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
} afe_config_t;
@ -111,6 +119,8 @@ typedef struct {
.pcm_config.sample_rate = 16000, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
@ -138,6 +148,8 @@ typedef struct {
.pcm_config.sample_rate = 16000, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
}
#endif

View File

@ -0,0 +1,64 @@
#pragma once
#include "stdint.h"
// Opaque model data container; its layout is not defined in this header.
typedef struct esp_nsn_data_t esp_nsn_data_t;
/**
* @brief Function type to initialize a noise suppression model instance
*
* @param model_name The name of the model instance
* @returns Handle to the model data
*/
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
/**
* @brief Get the amount of samples that need to be passed to the process function
*
* Every noise suppression model processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param model The model object to query
* @return The amount of samples to feed the process function
*/
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
/**
* @brief Feed samples of an audio stream to the noise suppression model and get the processed data.
*
* @param model The model object to use
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
* get_samp_chunksize function.
* @param out_data An array that receives the 16-bit signed audio samples after processing.
* NOTE(review): presumably the same length as in_data - confirm with the implementation.
* @return Status of the call. The meaning of the individual values is not defined in this header.
*/
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
/**
* @brief Get the sample rate of the samples to feed to the process function
*
* @param model The model object to query
* @return The sample rate, in Hz
*/
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
/**
* @brief Destroy a noise suppression model
*
* @param model Model object to destroy
*/
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
/**
* This structure contains the functions used to do operations on a noise suppression model.
*/
typedef struct {
esp_nsn_iface_op_create_t create;
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
esp_nsn_iface_op_process_t process;
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
esp_nsn_iface_op_destroy_t destroy;
} esp_nsn_iface_t;

View File

@ -0,0 +1,9 @@
#pragma once
#include "esp_nsn_iface.h"
// The prefix of the nsnet model name is used to filter all nsnet models from the available models.
#define ESP_NSNET_PREFIX "nsnet"
extern const esp_nsn_iface_t esp_nsnet1_quantized;
#define ESP_NSN_HANDLE esp_nsnet1_quantized

View File

@ -39,6 +39,12 @@ typedef struct {
int sample_rate; // sample rate of audio
} afe_pcm_config_t;
/**
* @brief Noise suppression (NS) method selected through afe_config_t.afe_ns_mode
*/
typedef enum {
NS_MODE_SSP = 0, // signal-processing based noise suppression
NS_MODE_NET = 1, // neural-network based (deep) noise suppression
} afe_ns_mode_t;
/**
* @brief Function to get the debug audio data
*
@ -82,6 +88,8 @@ typedef struct {
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
bool debug_init;
afe_debug_hook_t debug_hook[AFE_DEBUG_HOOK_MAX];
afe_ns_mode_t afe_ns_mode;
char *afe_ns_model_name;
} afe_config_t;
@ -111,6 +119,8 @@ typedef struct {
.pcm_config.sample_rate = 16000, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
}
#elif CONFIG_IDF_TARGET_ESP32S3
#define AFE_CONFIG_DEFAULT() { \
@ -138,6 +148,8 @@ typedef struct {
.pcm_config.sample_rate = 16000, \
.debug_init = false, \
.debug_hook = {{AFE_DEBUG_HOOK_MASE_TASK_IN, NULL}, {AFE_DEBUG_HOOK_FETCH_TASK_IN, NULL}}, \
.afe_ns_mode = NS_MODE_SSP, \
.afe_ns_model_name = NULL, \
}
#endif

View File

@ -0,0 +1,64 @@
#pragma once
#include "stdint.h"
// Opaque model data container; its layout is not defined in this header.
typedef struct esp_nsn_data_t esp_nsn_data_t;
/**
* @brief Function type to initialize a noise suppression model instance
*
* @param model_name The name of the model instance
* @returns Handle to the model data
*/
typedef esp_nsn_data_t* (*esp_nsn_iface_op_create_t)(char *model_name);
/**
* @brief Get the amount of samples that need to be passed to the process function
*
* Every noise suppression model processes a certain number of samples at the same time. This function
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
*
* @param model The model object to query
* @return The amount of samples to feed the process function
*/
typedef int (*esp_nsn_iface_op_get_samp_chunksize_t)(esp_nsn_data_t *model);
/**
* @brief Feed samples of an audio stream to the noise suppression model and get the processed data.
*
* @param model The model object to use
* @param in_data An array of 16-bit signed audio samples. The array size used can be queried by the
* get_samp_chunksize function.
* @param out_data An array that receives the 16-bit signed audio samples after processing.
* NOTE(review): presumably the same length as in_data - confirm with the implementation.
* @return Status of the call. The meaning of the individual values is not defined in this header.
*/
typedef int (*esp_nsn_iface_op_process_t)(esp_nsn_data_t *model, int16_t *in_data, int16_t *out_data);
/**
* @brief Get the sample rate of the samples to feed to the process function
*
* @param model The model object to query
* @return The sample rate, in Hz
*/
typedef int (*esp_nsn_iface_op_get_samp_rate_t)(esp_nsn_data_t *model);
/**
* @brief Destroy a noise suppression model
*
* @param model Model object to destroy
*/
typedef void (*esp_nsn_iface_op_destroy_t)(esp_nsn_data_t *model);
/**
* This structure contains the functions used to do operations on a noise suppression model.
*/
typedef struct {
esp_nsn_iface_op_create_t create;
esp_nsn_iface_op_get_samp_chunksize_t get_samp_chunksize;
esp_nsn_iface_op_process_t process;
esp_nsn_iface_op_get_samp_rate_t get_samp_rate;
esp_nsn_iface_op_destroy_t destroy;
} esp_nsn_iface_t;

View File

@ -0,0 +1,9 @@
#pragma once
#include "esp_nsn_iface.h"
// The prefix of the nsnet model name is used to filter all nsnet models from the available models.
#define ESP_NSNET_PREFIX "nsnet"
extern const esp_nsn_iface_t esp_nsnet1_quantized;
#define ESP_NSN_HANDLE esp_nsnet1_quantized

Binary file not shown.

Binary file not shown.

Binary file not shown.

BIN
lib/esp32s3/libnsnet.a Normal file

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1,2 @@
# (neural network type)_(model data version)_ns_0_0.0_0.0
nsnet1_v1_ns_0_0.0_0.0

Binary file not shown.

Binary file not shown.

View File

@ -229,7 +229,8 @@ void srmodel_config_deinit(srmodel_list_t *models)
}
free(models);
}
models = NULL;
// models is static_srmodels
static_srmodels = NULL;
}
model_coeff_getter_t *srmodel_get_model_coeff(char *model_name)

View File

@ -20,6 +20,11 @@
#include "dl_lib_convq_queue.h"
#include <sys/time.h>
#ifdef CONFIG_IDF_TARGET_ESP32S3
#include "esp_nsn_models.h"
#include "esp_nsn_iface.h"
#endif
#define ARRAY_SIZE_OFFSET 8 // Increase this if audio_sys_get_real_time_stats returns ESP_ERR_INVALID_SIZE
#define AUDIO_SYS_TASKS_ELAPSED_TIME_MS 1000 // Period of stats measurement
@ -31,6 +36,9 @@ static int total_ram_size_before = 0;
static int internal_ram_size_before = 0;
static int psram_size_before = 0;
#if (CONFIG_FREERTOS_VTASKLIST_INCLUDE_COREID && CONFIG_FREERTOS_GENERATE_RUN_TIME_STATS)
const static char *task_state[] = {
"Running",
"Ready",
@ -43,9 +51,10 @@ const static char *task_state[] = {
* "Extr": Allocated task stack from PSRAM, "Intr": Allocated task stack from internal RAM
*/
const static char *task_stack[] = {"Extr", "Intr"};
#endif
TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<<", "[afe]")
TEST_CASE(">>>>>>>> audio_front_end SR create/destroy API & memory leak <<<<<<<<", "[afe_sr]")
{
int audio_chunksize = 0;
int16_t *feed_buff = NULL;
@ -299,7 +308,7 @@ void test_print_cpuloading(void *arg)
vTaskDelete(NULL);
}
TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe]")
TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe_sr]")
{
srmodel_list_t *models = esp_srmodel_init("model");
if (models!=NULL) {
@ -346,7 +355,7 @@ TEST_CASE("audio_front_end SR cpu loading and memory info", "[afe]")
TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe]")
TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe_vc]")
{
int start_total_mem_size = 0;
int start_internal_mem_size = 0;
@ -367,68 +376,100 @@ TEST_CASE("audio_front_end VC create/destroy API & memory leak", "[afe]")
for (int se_init = 0; se_init < 2; se_init++) {
for (int vad_init = 0; vad_init < 2; vad_init++) {
for (int voice_communication_agc_init = 0; voice_communication_agc_init < 2; voice_communication_agc_init++) {
printf("aec_init: %d, se_init: %d, vad_init: %d, voice_communication_agc_init: %d\n", aec_init, se_init, vad_init, voice_communication_agc_init);
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.voice_communication_agc_init = voice_communication_agc_init;
#ifdef CONFIG_IDF_TARGET_ESP32S3
for (int afe_ns_mode = 0; afe_ns_mode < 2; afe_ns_mode++) {
#else
int afe_ns_mode = NS_MODE_SSP;
#endif
printf("aec_init: %d, se_init: %d, vad_init: %d, voice_communication_agc_init: %d, afe_ns_mode: %d\n", aec_init, se_init, vad_init, voice_communication_agc_init, afe_ns_mode);
afe_config.aec_init = aec_init;
afe_config.se_init = se_init;
afe_config.vad_init = vad_init;
afe_config.voice_communication_agc_init = voice_communication_agc_init;
afe_config.afe_ns_mode = afe_ns_mode;
start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
//start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
//start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
//start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
for (int i = 0; i < 6; i++) {
printf("index: %d\n", i);
for (int i = 0; i < 2; i++) {
printf("index: %d\n", i);
vTaskDelay(500 / portTICK_PERIOD_MS);
start_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
start_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
start_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
srmodel_list_t *models = esp_srmodel_init("model");
char *nsnet_name = NULL;
#ifdef CONFIG_IDF_TARGET_ESP32S3
nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");
afe_config.afe_ns_model_name = nsnet_name;
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
printf("afe_data is null\n");
continue;
}
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
printf("afe_data is null\n");
continue;
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
}
esp_srmodel_deinit(models);
vTaskDelay(1000 / portTICK_PERIOD_MS);
end_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
end_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
end_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
printf("memory leak: %d\n", start_total_mem_size - end_total_mem_size);
if (i > 0) { // skip index = 0
TEST_ASSERT_EQUAL(start_internal_mem_size, end_internal_mem_size);
TEST_ASSERT_EQUAL(start_spiram_mem_size, end_spiram_mem_size);
TEST_ASSERT_EQUAL(start_total_mem_size, end_total_mem_size);
} else {
TEST_ASSERT_EQUAL(true, (start_total_mem_size - end_total_mem_size) < 1000);
}
}
#ifdef CONFIG_IDF_TARGET_ESP32S3
}
audio_chunksize = afe_handle->get_feed_chunksize(afe_data);
feed_buff = malloc(audio_chunksize * sizeof(int16_t) * afe_config.pcm_config.total_ch_num);
assert(feed_buff);
afe_handle->feed(afe_data, feed_buff);
afe_handle->destroy(afe_data);
afe_data = NULL;
if (feed_buff) {
free(feed_buff);
feed_buff = NULL;
}
vTaskDelay(100 / portTICK_PERIOD_MS);
end_total_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT);
end_internal_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
end_spiram_mem_size = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
printf("memory leak: %d\n", start_total_mem_size - end_total_mem_size);
if (i > 0) { // skip index = 0
TEST_ASSERT_EQUAL(start_internal_mem_size, end_internal_mem_size);
TEST_ASSERT_EQUAL(start_spiram_mem_size, end_spiram_mem_size);
TEST_ASSERT_EQUAL(start_total_mem_size, end_total_mem_size);
} else {
TEST_ASSERT_EQUAL(true, (start_total_mem_size - end_total_mem_size) < 1000);
}
}
#endif
}
}
}
}
}
TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe]")
TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe_vc]")
{
total_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT);
internal_ram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_INTERNAL);
psram_size_before = heap_caps_get_free_size(MALLOC_CAP_8BIT | MALLOC_CAP_SPIRAM);
srmodel_list_t *models = esp_srmodel_init("model");
char *nsnet_name = NULL;
#ifdef CONFIG_IDF_TARGET_ESP32S3
nsnet_name = esp_srmodel_filter(models, ESP_NSNET_PREFIX, NULL);
#endif
printf("nsnet_name: %s\n", nsnet_name ? nsnet_name : "");
esp_afe_sr_iface_t *afe_handle = (esp_afe_sr_iface_t *)&ESP_AFE_VC_HANDLE;
afe_config_t afe_config = AFE_CONFIG_DEFAULT();
afe_config.wakenet_init = false;
afe_config.voice_communication_init = true;
afe_config.voice_communication_agc_init = true;
#ifdef CONFIG_IDF_TARGET_ESP32S3
afe_config.afe_ns_mode = NS_MODE_NET;
#else
afe_config.afe_ns_mode = NS_MODE_SSP;
#endif
afe_config.afe_ns_model_name = nsnet_name;
afe_data = afe_handle->create_from_config(&afe_config);
if (!afe_data) {
@ -447,6 +488,7 @@ TEST_CASE("audio_front_end VC cpu loading and memory info", "[afe]")
vTaskDelay(2000 / portTICK_PERIOD_MS);
ESP_LOGI(TAG, "destroy\n");
afe_handle->destroy(afe_data);
esp_srmodel_deinit(models);
afe_data = NULL;
ESP_LOGI(TAG, "successful\n");
}

View File

@ -44,8 +44,23 @@ def test_wakenet(dut: Dut)-> None:
'wn9_hilexin',
],
)
def test_afe(dut: Dut)-> None:
def test_sr_afe(dut: Dut)-> None:
# dut.run_all_single_board_cases(group="afe")
dut.expect_exact('Press ENTER to see the list of tests.')
dut.write('[afe]')
dut.write('[afe_sr]')
dut.expect_unity_test_output(timeout = 1000)
@pytest.mark.target('esp32s3')
@pytest.mark.env('esp32s3')
@pytest.mark.parametrize(
'config',
[
'nsnet1',
],
)
def test_vc_afe(dut: Dut)-> None:
# dut.run_all_single_board_cases(group="afe")
dut.expect_exact('Press ENTER to see the list of tests.')
dut.write('[afe_vc]')
dut.expect_unity_test_output(timeout = 100000)

File diff suppressed because it is too large Load Diff