diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 6759c6b..641a67c 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -196,6 +196,11 @@ menu "Load Multiple Wake Words" config SR_WN_WN9_XIAOMINGTONGXUE_TTS2 bool "小明同学 (wn9_xiaomingtongxue_tts2)" default False + + + config SR_WN_WN9_HIWALLE_TTS2 + bool "Hi Wall E or Hi 瓦力(wn9_hiwalle_tts2)" + default False endmenu diff --git a/README.md b/README.md index 3b40e90..a2d2299 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,7 @@ The following wake words are supported in esp-sr: |Hey,Wand | | wn9_heywanda_tts | |Astrolabe | | wn9_astrolabe_tts | |Hi,Jason | | wn9_hijason_tts2 | +|Hi,Wall E/Hi,瓦力| | wn9_hiwalle_tts2 | |你好小鑫 | | wn9_nihaoxiaoxin_tts | |小美同学 | | wn9_xiaomeitongxue_tts | |Hi,小星 | | wn9_hixiaoxing_tts | diff --git a/include/esp32p4/esp_afe_config.h b/include/esp32p4/esp_afe_config.h index 00ac15b..5bb8311 100644 --- a/include/esp32p4/esp_afe_config.h +++ b/include/esp32p4/esp_afe_config.h @@ -126,7 +126,7 @@ typedef struct { afe_agc_mode_t agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain. int agc_compression_gain_db; // Compression gain in dB (default 9) - int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3) + int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS) /********** General AFE(Audio Front End) parameter **********/ afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function. diff --git a/model/wakenet_model/wn9_hiwalle_tts2/_MODEL_INFO_ b/model/wakenet_model/wn9_hiwalle_tts2/_MODEL_INFO_ new file mode 100644 index 0000000..42c5ebf --- /dev/null +++ b/model/wakenet_model/wn9_hiwalle_tts2/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts2h12_Hi,Wall,E or Hi,瓦力_3_0.630_0.635 diff --git a/model/wakenet_model/wn9_hiwalle_tts2/wn9_data b/model/wakenet_model/wn9_hiwalle_tts2/wn9_data new file mode 100644 index 0000000..ae2aa4f Binary files /dev/null and b/model/wakenet_model/wn9_hiwalle_tts2/wn9_data differ diff --git a/model/wakenet_model/wn9_hiwalle_tts2/wn9_index b/model/wakenet_model/wn9_hiwalle_tts2/wn9_index new file mode 100644 index 0000000..5e7c881 Binary files /dev/null and b/model/wakenet_model/wn9_hiwalle_tts2/wn9_index differ diff --git a/test_apps/esp-sr/main/test_multinet.cpp b/test_apps/esp-sr/main/test_multinet.cpp index 9db0a1c..bbe5b08 100644 --- a/test_apps/esp-sr/main/test_multinet.cpp +++ b/test_apps/esp-sr/main/test_multinet.cpp @@ -126,82 +126,148 @@ TEST_CASE("multinet cpu loading", "[mn]") TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); } -// TEST_CASE("multinet set commands and detect", "[mn]") -// { -// vTaskDelay(500 / portTICK_PERIOD_MS); -// srmodel_list_t *models = esp_srmodel_init("model"); -// char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); -// esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name); +TEST_CASE("multinet set commands and detect", "[mn]") +{ + vTaskDelay(500 / portTICK_PERIOD_MS); + srmodel_list_t *models = esp_srmodel_init("model"); + char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); + esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name); -// model_iface_data_t *model_data = multinet->create(model_name, 6000); -// int frequency = multinet->get_samp_rate(model_data); -// int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t); -// char *lang = multinet->get_language(model_data); -// esp_mn_commands_update_from_sdkconfig(multinet, model_data); -// unsigned char* data = NULL; -// size_t data_size = 0; -// if (strcmp(lang, ESP_MN_ENGLISH) == 0) { -// data = (unsigned char*)tell_me_a_joke; -// data_size = sizeof(tell_me_a_joke); -// printf("commands: tell me a joke, size:%d\n", data_size); -// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { -// data = (unsigned char*)da_kai_kong_tiao; -// data_size = sizeof(da_kai_kong_tiao); -// printf("commands: da kai kong tiao, size:%d\n", data_size); -// } - -// int16_t *buffer = (int16_t *) malloc(audio_chunksize); -// int chunks = 0; -// struct timeval tv_start, tv_end; -// gettimeofday(&tv_start, NULL); -// esp_mn_state_t mn_state; -// esp_mn_error_t *error_phrases = NULL; -// esp_mn_commands_clear(); + model_iface_data_t *model_data = multinet->create(model_name, 6000); + int frequency = multinet->get_samp_rate(model_data); + int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t); + char *lang = multinet->get_language(model_data); + esp_mn_commands_update_from_sdkconfig(multinet, model_data); + unsigned char* data = NULL; + size_t data_size = 0; + if (strcmp(lang, ESP_MN_ENGLISH) == 0) { + data = (unsigned char*)tell_me_a_joke; + data_size = sizeof(tell_me_a_joke); + printf("commands: tell me a joke, size:%d\n", data_size); + } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { + data = (unsigned char*)da_kai_kong_tiao; + data_size = sizeof(da_kai_kong_tiao); + printf("commands: da kai kong tiao, size:%d\n", data_size); + } + + int16_t *buffer = (int16_t *) malloc(audio_chunksize); + int chunks = 0; + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + esp_mn_state_t mn_state; + esp_mn_error_t *error_phrases = NULL; + esp_mn_commands_clear(); -// if (strcmp(lang, ESP_MN_ENGLISH) == 0) { -// esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK"); -// esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel"); -// error_phrases = esp_mn_commands_update(); -// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { -// esp_mn_commands_add(1, "da kai kong tiao"); -// esp_mn_commands_add(2, "guan bi kong tiao"); -// error_phrases = esp_mn_commands_update(); -// } else { -// printf("Invalid language\n"); -// } -// multinet->print_active_speech_commands(model_data); + if (strcmp(lang, ESP_MN_ENGLISH) == 0) { + esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK"); + esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel"); + error_phrases = esp_mn_commands_update(); + } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { + esp_mn_commands_add(1, "da kai kong tiao"); + esp_mn_commands_add(2, "guan bi kong tiao"); + error_phrases = esp_mn_commands_update(); + } else { + printf("Invalid language\n"); + } + multinet->print_active_speech_commands(model_data); + + while (1) { + if ((chunks + 1)*audio_chunksize <= data_size) { + memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize); + } else { + memset(buffer, 0, audio_chunksize); + } + mn_state = multinet->detect(model_data, buffer); + if (mn_state == ESP_MN_STATE_DETECTED) { + esp_mn_results_t *mn_result = multinet->get_results(model_data); + if (mn_result->num > 0) + printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string); + else + printf("timeout\n"); + break; + } + chunks++; + if (chunks > 600) + break; + } + gettimeofday(&tv_end, NULL); + int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000; + chunks -= 7; + int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency; + printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n", + tv_ms, run_ms, chunks, tv_ms*100.0/run_ms); + + multinet->destroy(model_data); + esp_srmodel_deinit(models); + TEST_ASSERT_EQUAL(true, error_phrases == NULL); + TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); +} + +TEST_CASE("multinet set commands from sdkconfig and detect", "[mn]") +{ + vTaskDelay(500 / portTICK_PERIOD_MS); + srmodel_list_t *models = esp_srmodel_init("model"); + char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); + esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name); + + model_iface_data_t *model_data = multinet->create(model_name, 6000); + int frequency = multinet->get_samp_rate(model_data); + int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t); + char *lang = multinet->get_language(model_data); + esp_mn_commands_update_from_sdkconfig(multinet, model_data); + unsigned char* data = NULL; + size_t data_size = 0; + if (strcmp(lang, ESP_MN_ENGLISH) == 0) { + data = (unsigned char*)tell_me_a_joke; + data_size = sizeof(tell_me_a_joke); + printf("commands: tell me a joke, size:%d\n", data_size); + } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { + data = (unsigned char*)da_kai_kong_tiao; + data_size = sizeof(da_kai_kong_tiao); + printf("commands: da kai kong tiao, size:%d\n", data_size); + } + + int16_t *buffer = (int16_t *) malloc(audio_chunksize); + int chunks = 0; + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + esp_mn_state_t mn_state; + esp_mn_error_t *error_phrases = NULL; + esp_mn_commands_update_from_sdkconfig(multinet, model_data); + multinet->print_active_speech_commands(model_data); + + while (1) { + if ((chunks + 1)*audio_chunksize <= data_size) { + memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize); + } else { + memset(buffer, 0, audio_chunksize); + } + mn_state = multinet->detect(model_data, buffer); + if (mn_state == ESP_MN_STATE_DETECTED) { + esp_mn_results_t *mn_result = multinet->get_results(model_data); + if (mn_result->num > 0) + printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string); + else + printf("timeout\n"); + break; + } + chunks++; + if (chunks > 600) + break; + } + gettimeofday(&tv_end, NULL); + int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000; + chunks -= 7; + int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency; + printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n", + tv_ms, run_ms, chunks, tv_ms*100.0/run_ms); + + multinet->destroy(model_data); + esp_srmodel_deinit(models); + TEST_ASSERT_EQUAL(true, error_phrases == NULL); + TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); +} -// while (1) { -// if ((chunks + 1)*audio_chunksize <= data_size) { -// memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize); -// } else { -// memset(buffer, 0, audio_chunksize); -// } -// mn_state = multinet->detect(model_data, buffer); -// if (mn_state == ESP_MN_STATE_DETECTED) { -// esp_mn_results_t *mn_result = multinet->get_results(model_data); -// if (mn_result->num > 0) -// printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string); -// else -// printf("timeout\n"); -// break; -// } -// chunks++; -// if (chunks > 600) -// break; -// } -// gettimeofday(&tv_end, NULL); -// int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000; -// chunks -= 7; -// int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency; -// printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n", -// tv_ms, run_ms, chunks, tv_ms*100.0/run_ms); - -// multinet->destroy(model_data); -// esp_srmodel_deinit(models); -// TEST_ASSERT_EQUAL(true, error_phrases == NULL); -// TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); -// } TEST_CASE("multinet set commands", "[mn]") {