feature/Add NS/AEC/AGC/VAD algorithm

2025-09-15 15:28:44 +08:00 · 2019-08-19 17:09:21 +08:00 · 2019-08-19 17:09:21 +08:00 · 0f6442f84a
commit 0f6442f84a
parent c62aeb3826
16 changed files with 8066 additions and 2 deletions
--- a/1
+++ b/1
@ -5,6 +5,7 @@ MODULE_PATH := $(abspath $(shell pwd))
 EXTRA_COMPONENT_DIRS += $(MODULE_PATH)/lib
 EXTRA_COMPONENT_DIRS += $(MODULE_PATH)/wake_word_engine
 EXTRA_COMPONENT_DIRS += $(MODULE_PATH)/speech_command_recognition
+EXTRA_COMPONENT_DIRS += $(MODULE_PATH)/acoustic_algorithm

 include $(IDF_PATH)/make/project.mk

--- a/acoustic_algorithm/component.mk
+++ b/acoustic_algorithm/component.mk
@ -0,0 +1,11 @@
+# Public headers of this component live in ./include.
+COMPONENT_ADD_INCLUDEDIRS := include
+
+COMPONENT_SRCDIRS := .
+
+# Collect every prebuilt static library that sits next to this makefile.
+# $(wildcard) is expanded by make itself (no shell is spawned and nothing is
+# printed when no archive matches); $(sort) keeps the order deterministic.
+LIB_FILES := $(sort $(wildcard $(COMPONENT_PATH)/lib*.a))
+
+# Turn "libfoo.a" into the linker switch "-lfoo".
+LIBS := $(patsubst lib%.a,-l%,$(notdir $(LIB_FILES)))
+
+COMPONENT_ADD_LDFLAGS += -L$(COMPONENT_PATH)/ $(LIBS)
+
+# NOTE(review): presumably registers the archives as link dependencies of the
+# project -- confirm against the ESP-IDF make build system.
+ALL_LIB_FILES += $(LIB_FILES)
--- a/acoustic_algorithm/include/esp_aec.h
+++ b/acoustic_algorithm/include/esp_aec.h
@ -0,0 +1,76 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AEC_H_
+#define _ESP_AEC_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define USE_AEC_FFT                      // Not kiss_fft
+#define AEC_SAMPLE_RATE 16000            // Only 16000 Hz is supported
+#define AEC_FRAME_LENGTH_MS 16           // Only 16 ms is supported
+#define AEC_FILTER_LENGTH 1200           // Number of samples of echo to cancel
+
+typedef void* aec_handle_t;
+
+/**
+ * @brief Creates an instance of the AEC structure.
+ *
+ * @param sample_rate       The sampling frequency in Hz; pass AEC_SAMPLE_RATE.
+ *                          NOTE(review): the macro note says only 16000 Hz is supported,
+ *                          confirm whether 8000 Hz is accepted as well.
+ *
+ * @param frame_length      The length of one audio frame in ms; pass AEC_FRAME_LENGTH_MS.
+ *                          NOTE(review): the macro note says only 16 ms is supported,
+ *                          confirm whether 10/20/30 ms are accepted as well.
+ *
+ * @param filter_length     Number of samples of echo to cancel, e.g. AEC_FILTER_LENGTH.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of AEC
+ */
+aec_handle_t aec_create(int sample_rate, int frame_length, int filter_length);
+
+
+/**
+ * @brief Performs echo cancellation on one frame, based on the audio sent to the speaker and the frame from the mic.
+ *
+ * @param inst        The instance of AEC.
+ *
+ * @param indata      An array of 16-bit signed audio samples from the mic.
+ *
+ * @param refdata     An array of 16-bit signed audio samples sent to the speaker.
+ *
+ * @param outdata     Returns the near-end signal with the echo removed.
+ *
+ * @note Presumably every buffer holds exactly one frame; the bundled test uses
+ *       AEC_FRAME_LENGTH_MS * 16 samples per buffer -- TODO confirm.
+ *
+ * @return None
+ *
+ */
+void aec_process(aec_handle_t inst, int16_t *indata, int16_t *refdata, int16_t *outdata);
+
+/**
+ * @brief Free the AEC instance
+ *
+ * @param inst The instance of AEC.
+ *
+ * @return None
+ *
+ */
+void aec_destroy(aec_handle_t inst);
+
+#ifdef __cplusplus
+}   // closes the extern "C" block opened at the top of this header
+#endif
+
+#endif //_ESP_AEC_H_
--- a/acoustic_algorithm/include/esp_agc.h
+++ b/acoustic_algorithm/include/esp_agc.h
@ -0,0 +1,31 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_AGC_H_
+#define _ESP_AGC_H_
+
+////all positive value is valid, negective is error
+typedef enum {
+    ESP_AGC_SUCCESS = 0,   ////success
+    ESP_AGC_FAIL = -1, ////agc fail
+    ESP_AGC_SAMPLE_RATE_ERROR = -2,  ///sample rate can be only 8khz, 16khz, 32khz
+    ESP_AGC_FRAME_SIZE_ERROR = -3,   ////the input frame size should be only 10ms, so should together with sample-rate to get the frame size
+} ESP_AGE_ERR;
+
+
+void *esp_agc_open(int agc_mode, int sample_rate);
+void set_agc_config(void *agc_handle, int gain_dB, int limiter_enable, int target_level_dbfs);
+int esp_agc_process(void *agc_handle, short *in_pcm, short *out_pcm, int frame_size, int sample_rate);
+void esp_agc_clse(void *agc_handle);
+
+#endif // _ESP_AGC_H_
--- a/acoustic_algorithm/include/esp_ns.h
+++ b/acoustic_algorithm/include/esp_ns.h
@ -0,0 +1,70 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_NS_H_
+#define _ESP_NS_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define NS_FRAME_LENGTH_MS 30          // Supported frame lengths: 10 ms, 20 ms, 30 ms
+
+/**
+* The sampling frequency must be 16000 Hz.
+*/
+
+typedef void* ns_handle_t;
+
+/**
+ * @brief Creates an instance of the NS structure.
+ *
+ * @param frame_length_ms The length of one audio frame; can be 10 ms, 20 ms or 30 ms.
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of NS
+ */
+ns_handle_t ns_create(int frame_length_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the NS and get the audio stream after noise suppression.
+ *
+ * @param inst        The instance of NS.
+ *
+ * @param indata      An array of 16-bit signed audio samples.
+ *
+ * @param outdata     An array of 16-bit signed audio samples after noise suppression.
+ *
+ * @note Presumably both buffers hold exactly one frame; the bundled test uses
+ *       NS_FRAME_LENGTH_MS * 16 samples per buffer -- TODO confirm.
+ *
+ * @return None
+ *
+ */
+void ns_process(ns_handle_t inst, int16_t *indata, int16_t *outdata);
+
+/**
+ * @brief Free the NS instance
+ *
+ * @param inst The instance of NS.
+ *
+ * @return None
+ *
+ */
+void ns_destroy(ns_handle_t inst);
+
+#ifdef __cplusplus
+}   // closes the extern "C" block opened at the top of this header
+#endif
+
+#endif //_ESP_NS_H_
--- a/acoustic_algorithm/include/esp_vad.h
+++ b/acoustic_algorithm/include/esp_vad.h
@ -0,0 +1,104 @@
+// Copyright 2015-2019 Espressif Systems (Shanghai) PTE LTD
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License
+#ifndef _ESP_VAD_H_
+#define _ESP_VAD_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SAMPLE_RATE_HZ 16000      // Supported sample rates: 32000, 16000, 8000
+#define VAD_FRAME_LENGTH_MS 30    // Supported frame lengths: 10 ms, 20 ms, 30 ms
+
+/**
+ * @brief VAD operating modes. A more aggressive (higher mode) VAD is more
+ * restrictive in reporting speech.
+ */
+typedef enum {
+    VAD_MODE_0 = 0,
+    VAD_MODE_1,
+    VAD_MODE_2,
+    VAD_MODE_3,
+    VAD_MODE_4
+} vad_mode_t;
+
+/**
+ * @brief Result of processing one frame.
+ */
+typedef enum {
+    VAD_SILENCE = 0,
+    VAD_SPEECH
+} vad_state_t;
+
+typedef void* vad_handle_t;
+
+/**
+ * @brief Creates an instance of the VAD structure.
+ *
+ * @param vad_mode          Sets the VAD operating mode.
+ *
+ * @param sample_rate_hz    The sampling frequency in Hz; can be 32000, 16000 or 8000 (see SAMPLE_RATE_HZ).
+ *
+ * @param one_frame_ms      The length of one audio frame; can be 10 ms, 20 ms or 30 ms (see VAD_FRAME_LENGTH_MS).
+ *
+ * @return
+ *         - NULL: Create failed
+ *         - Others: The instance of VAD
+ */
+vad_handle_t vad_create(vad_mode_t vad_mode, int sample_rate_hz, int one_frame_ms);
+
+/**
+ * @brief Feed samples of an audio stream to the VAD and check if there is someone speaking.
+ *
+ * @param inst      The instance of VAD.
+ *
+ * @param data      An array of 16-bit signed audio samples.
+ *                  Presumably exactly one frame; the bundled test passes
+ *                  VAD_FRAME_LENGTH_MS * 16 samples at 16 kHz -- TODO confirm.
+ *
+ * @return
+ *         - VAD_SILENCE if no voice
+ *         - VAD_SPEECH  if voice is detected
+ *
+ */
+vad_state_t vad_process(vad_handle_t inst, int16_t *data);
+
+/**
+ * @brief Free the VAD instance
+ *
+ * @param inst The instance of VAD.
+ *
+ * @return None
+ *
+ */
+void vad_destroy(vad_handle_t inst);
+
+/*
+* Programming Guide:
+*
+* @code{c}
+* vad_handle_t vad_inst = vad_create(VAD_MODE_3, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);     // Creates an instance of the VAD structure.
+*
+* while (1) {
+*    // Use buffer to receive the audio data from the MIC.
+*    vad_state_t vad_state = vad_process(vad_inst, buffer);      // Feed samples to the VAD and get the result.
+* }
+*
+* vad_destroy(vad_inst);   // Free the VAD instance at the end of the whole VAD process
+*
+* @endcode
+*/
+
+#ifdef __cplusplus
+}   // closes the extern "C" block opened at the top of this header
+#endif
+
+#endif //_ESP_VAD_H_
--- a/acoustic_algorithm/libesp_audio_processor.a
+++ b/acoustic_algorithm/libesp_audio_processor.a
--- a/main/audio_process.c
+++ b/main/audio_process.c
@ -0,0 +1,120 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "freertos/FreeRTOS.h"
+#include "freertos/task.h"
+
+#include "audio_process.h"
+#include "esp_ns.h"
+#include "esp_aec.h"
+#include "esp_agc.h"
+#include "esp_vad.h"
+#include "audio_test_file.h"
+
+/**
+ * @brief FreeRTOS task that runs the noise-suppression (NS) smoke test.
+ *
+ * Feeds audio_test_file to ns_process() one frame at a time
+ * (NS_FRAME_LENGTH_MS * 16 samples per frame), releases every resource,
+ * prints the outcome and finally deletes the calling task.
+ *
+ * @param arg Unused.
+ */
+void NSTask(void *arg)
+{
+    (void)arg;
+    const size_t frame_bytes = NS_FRAME_LENGTH_MS * 16 * sizeof(int16_t);
+    ns_handle_t ns_inst = ns_create(NS_FRAME_LENGTH_MS);
+    int16_t *ns_in = malloc(frame_bytes);
+    int16_t *ns_out = malloc(frame_bytes);
+
+    if (ns_inst == NULL || ns_in == NULL || ns_out == NULL) {
+        // Never hand a NULL handle or NULL buffers to memcpy()/the library.
+        printf("NS test failed: cannot allocate resources\n\n");
+    } else {
+        // sizeof(audio_test_file) counts bytes, so walk the clip through a byte
+        // pointer; this keeps the offsets right whatever the array's element type is.
+        const unsigned char *clip = (const unsigned char *)audio_test_file;
+        for (size_t offset = 0; offset + frame_bytes <= sizeof(audio_test_file); offset += frame_bytes) {
+            memcpy(ns_in, clip + offset, frame_bytes);
+            ns_process(ns_inst, ns_in, ns_out);
+        }
+        printf("NS test successfully\n\n");
+    }
+
+    if (ns_inst != NULL) {
+        ns_destroy(ns_inst);
+    }
+    free(ns_in);
+    free(ns_out);
+    vTaskDelete(NULL);
+}
+
+#define AGC_FRAME_BYTES     320     // one 10 ms frame at 16 kHz: 160 samples of 16 bits
+/**
+ * @brief FreeRTOS task that runs the automatic-gain-control (AGC) smoke test.
+ *
+ * Opens an AGC instance (mode 3, 16 kHz), configures it, pushes
+ * audio_test_file through esp_agc_process() in AGC_FRAME_BYTES chunks,
+ * releases every resource, prints the outcome and deletes the calling task.
+ * A negative status from esp_agc_process() stops the run and is reported as a failure.
+ *
+ * @param arg Unused.
+ */
+void AGCTask(void *arg)
+{
+    (void)arg;
+    void *agc_handle = esp_agc_open(3, 16000);
+    int16_t *agc_in  = malloc(AGC_FRAME_BYTES);
+    int16_t *agc_out = malloc(AGC_FRAME_BYTES);
+    int passed = (agc_handle != NULL && agc_in != NULL && agc_out != NULL);
+
+    if (passed) {
+        set_agc_config(agc_handle, 15, 1, -3);
+        // sizeof(audio_test_file) counts bytes, so address the clip byte-wise.
+        const unsigned char *clip = (const unsigned char *)audio_test_file;
+        for (size_t offset = 0; offset + AGC_FRAME_BYTES <= sizeof(audio_test_file); offset += AGC_FRAME_BYTES) {
+            memcpy(agc_in, clip + offset, AGC_FRAME_BYTES);
+            // frame_size is given in samples; negative return values are errors (see ESP_AGE_ERR).
+            if (esp_agc_process(agc_handle, agc_in, agc_out, AGC_FRAME_BYTES / 2, 16000) < 0) {
+                passed = 0;
+                break;
+            }
+        }
+    }
+
+    if (agc_handle != NULL) {
+        esp_agc_clse(agc_handle);
+    }
+    free(agc_in);
+    free(agc_out);
+    if (passed) {
+        printf("AGC test successfully\n\n");
+    } else {
+        printf("AGC test failed\n\n");
+    }
+    vTaskDelete(NULL);
+}
+
+/**
+ * @brief FreeRTOS task that runs the acoustic-echo-cancellation (AEC) smoke test.
+ *
+ * Feeds audio_test_file as the microphone signal, together with an all-zero
+ * reference signal, to aec_process() one frame at a time
+ * (AEC_FRAME_LENGTH_MS * 16 samples per frame), releases every resource,
+ * prints the outcome and deletes the calling task.
+ *
+ * @param arg Unused.
+ */
+void AECTask(void *arg)
+{
+    (void)arg;
+    const size_t frame_bytes = AEC_FRAME_LENGTH_MS * 16 * sizeof(int16_t);
+    aec_handle_t aec_inst = aec_create(AEC_SAMPLE_RATE, AEC_FRAME_LENGTH_MS, AEC_FILTER_LENGTH);
+    int16_t *aec_in = malloc(frame_bytes);
+    int16_t *aec_ref = malloc(frame_bytes);
+    int16_t *aec_out = malloc(frame_bytes);
+
+    if (aec_inst == NULL || aec_in == NULL || aec_ref == NULL || aec_out == NULL) {
+        // Never hand a NULL handle or NULL buffers to memcpy()/memset()/the library.
+        printf("AEC test failed: cannot allocate resources\n\n");
+    } else {
+        // sizeof(audio_test_file) counts bytes, so address the clip byte-wise.
+        const unsigned char *clip = (const unsigned char *)audio_test_file;
+        for (size_t offset = 0; offset + frame_bytes <= sizeof(audio_test_file); offset += frame_bytes) {
+            memcpy(aec_in, clip + offset, frame_bytes);
+            // Re-zeroed for every frame: refdata is passed as a non-const pointer,
+            // so the library may write to it.
+            memset(aec_ref, 0, frame_bytes);
+            aec_process(aec_inst, aec_in, aec_ref, aec_out);
+        }
+        printf("AEC test successfully\n\n");
+    }
+
+    if (aec_inst != NULL) {
+        aec_destroy(aec_inst);
+    }
+    free(aec_in);
+    free(aec_ref);
+    free(aec_out);
+    vTaskDelete(NULL);
+}
+
+/**
+ * @brief FreeRTOS task that runs the voice-activity-detection (VAD) smoke test.
+ *
+ * Feeds audio_test_file to vad_process() one frame at a time
+ * (VAD_FRAME_LENGTH_MS * 16 samples per frame), releases every resource,
+ * prints the outcome and deletes the calling task.
+ *
+ * @param arg Unused.
+ */
+void VADTask(void *arg)
+{
+    (void)arg;
+    const size_t frame_bytes = VAD_FRAME_LENGTH_MS * 16 * sizeof(int16_t);
+    vad_handle_t vad_inst = vad_create(VAD_MODE_4, SAMPLE_RATE_HZ, VAD_FRAME_LENGTH_MS);
+    int16_t *vad_in = malloc(frame_bytes);
+
+    if (vad_inst == NULL || vad_in == NULL) {
+        // Never hand a NULL handle or a NULL buffer to memcpy()/the library.
+        printf("VAD test failed: cannot allocate resources\n\n");
+    } else {
+        // sizeof(audio_test_file) counts bytes, so address the clip byte-wise.
+        const unsigned char *clip = (const unsigned char *)audio_test_file;
+        for (size_t offset = 0; offset + frame_bytes <= sizeof(audio_test_file); offset += frame_bytes) {
+            memcpy(vad_in, clip + offset, frame_bytes);
+            // Only exercising the call; the per-frame decision is not evaluated here.
+            (void)vad_process(vad_inst, vad_in);
+        }
+        printf("VAD test successfully\n\n");
+        // NOTE(review): presumably the completion marker of the third test stage
+        // (wakenet_test.c prints "TEST1 FINISHED") -- emitted only on success.
+        printf("TEST3 FINISHED\n\n");
+    }
+
+    if (vad_inst != NULL) {
+        vad_destroy(vad_inst);
+    }
+    free(vad_in);
+    vTaskDelete(NULL);
+}
+
+
+
+/**
+ * @brief Launches the four acoustic-algorithm smoke tests as FreeRTOS tasks.
+ *
+ * NS and AGC are pinned to core 1, AEC and VAD to core 0. Every task gets a
+ * 3 KiB stack and priority 5; consecutive launches are spaced one second apart.
+ * A task that cannot be created is reported on stdout.
+ */
+void audio_process_test(void)
+{
+    static const struct {
+        void (*entry)(void *);
+        const char *name;
+        int core;
+    } tests[] = {
+        { NSTask,  "noise_suppression",          1 },
+        { AGCTask, "automatic_gain_control",     1 },
+        { AECTask, "acoustic_echo_cancellation", 0 },
+        { VADTask, "voice_activity_detection",   0 },
+    };
+    const size_t test_count = sizeof(tests) / sizeof(tests[0]);
+
+    for (size_t i = 0; i < test_count; i++) {
+        if (i > 0) {
+            // Delay between launches only, not after the last one.
+            vTaskDelay(1000 / portTICK_PERIOD_MS);
+        }
+        if (xTaskCreatePinnedToCore(tests[i].entry, tests[i].name, 3 * 1024, NULL, 5, NULL, tests[i].core) != pdPASS) {
+            printf("Failed to start task %s\n", tests[i].name);
+        }
+    }
+}
--- a/main/include/audio_process.h
+++ b/main/include/audio_process.h
@ -0,0 +1,3 @@
+#pragma once
+
+/**
+ * @brief Starts the NS, AGC, AEC and VAD test tasks (see audio_process.c).
+ */
+void audio_process_test(void);
--- a/main/include/audio_test_file.h
+++ b/main/include/audio_test_file.h
--- a/main/include/dakaidiandeng.h
+++ b/main/include/dakaidiandeng.h
--- a/main/include/hilexin.h
+++ b/main/include/hilexin.h
--- a/main/include/multinet_test.h
+++ b/main/include/multinet_test.h
--- a/main/include/wakenet_test.h
+++ b/main/include/wakenet_test.h
--- a/main/main.c
+++ b/main/main.c
@ -8,6 +8,7 @@

 #include "wakenet_test.h"
 #include "multinet_test.h"
+#include "audio_process.h"

 void app_main()
 {
@ -15,6 +16,10 @@ void app_main()
    wakenet_test();
    vTaskDelay(3000 / portTICK_PERIOD_MS);

-    //test multinet
+    // test multinet
    multinet_test();
+    vTaskDelay(3000 / portTICK_PERIOD_MS);
+
+    // test acoustic algorithm
+    audio_process_test();
 }
--- a/main/wakenet_test.c
+++ b/main/wakenet_test.c
@ -41,7 +41,7 @@ void wakenetTask(void *arg)
    int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n", 
            tv_ms, chunks*30, chunks, tv_ms*1.0/chunks/3*10);
-    printf("TEST FINISHED\n\n");
+    printf("TEST1 FINISHED\n\n");
    vTaskDelete(NULL);
 }