diff --git a/Kconfig.projbuild b/Kconfig.projbuild index 81b5d07..4197e08 100644 --- a/Kconfig.projbuild +++ b/Kconfig.projbuild @@ -105,7 +105,7 @@ choice SR_WN_MODEL_LOAD config SR_WN_WN9_HIMFIVE bool "Hi,M Five (wn9_himfive)" depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 - + config SR_WN_WN9_NIHAOXIAOZHI_TTS bool "你好小智 (wn9_nihaoxiaozhi_tts)" depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 @@ -186,6 +186,14 @@ choice SR_WN_MODEL_LOAD bool "Astrolabe (wn9_astrolabe_tts)" depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + config SR_WN_WN9_XIAOYAXIAOYA_TTS2 + bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)" + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + + config SR_WN_WN9_HIJASON_TTS2 + bool "Hi,Jason (wn9_hijason_tts2)" + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + config SR_WN_WN9_CUSTOMWORD bool "customized word (wn9_customword)" depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 @@ -293,6 +301,14 @@ menu "Load Multiple Wake Words" bool "Astrolabe (wn9_astrolabe_tts)" depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + config SR_WN_WN9_XIAOYAXIAOYA_TTS2_MULTI + bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)" + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + + config SR_WN_WN9_HIJASON_TTS2_MULTI + bool "Hi,Jason (wn9_hijason_tts2)" + depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4 + endmenu config USE_MULTINET diff --git a/README.md b/README.md index 5e554fc..ef9b89f 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ The following wake words are supported in esp-sr: |Hi,Joy | | wn9_hijoy_tts | |Hey,Wand | | wn9_heywanda_tts | |Astrolabe | | wn9_astrolabe_tts | +|Hi,Jason | | wn9_hijason_tts2 | |你好小鑫 | | wn9_nihaoxiaoxin_tts | |小美同学 | | wn9_xiaomeitongxue_tts | |Hi,小星 | | wn9_hixiaoxing_tts | @@ -57,8 +58,9 @@ The following wake words are supported in esp-sr: |Hi,Telly/Hi,泰力 | | wn9_hitelly_tts | |小滨小滨/小冰小冰| | wn9_xiaobinxiaobin_tts | |Hi,小巫 | | wn9_haixiaowu_tts | +|小鸭小鸭 | | wn9_xiaoyaxiaoya_tts2 | -*NOTE:* `_tts` suffix means this 
def copy_vadnet_from_sdkconfig(model_path, sdkconfig_path, target_path):
    """
    Copy vadnet model from model_path to target_path based on sdkconfig

    The sdkconfig file is scanned for non-comment lines that mention
    ``CONFIG_SR_VADNET``.  If one of them contains the "small" option the
    ``vadnet1_small`` folder is copied; otherwise, if one contains the
    "medium" option, ``vadnet1_medium`` is copied.  When neither option is
    present nothing is copied.

    Args:
        model_path: directory that contains the ``vadnet_model`` folder.
        sdkconfig_path: path of the sdkconfig file to scan.
        target_path: directory that receives the selected model folder.

    Raises:
        OSError: propagated from ``io.open`` / ``shutil.copytree`` (for
            example when the source folder is missing or the destination
            folder already exists).
    """
    # Decode explicitly instead of relying on the platform locale.  Only
    # ASCII option names are matched below, so undecodable bytes elsewhere in
    # the file can be replaced safely rather than aborting the build step.
    with io.open(sdkconfig_path, "r", encoding="utf-8", errors="replace") as f:
        active_lines = [
            line.strip("\n")
            for line in f
            if 'CONFIG_SR_VADNET' in line and not line.startswith('#')
        ]

    def is_enabled(option):
        # Test each line on its own so that an option name can never be
        # "found" across the boundary of two unrelated lines.
        return any(option in line for line in active_lines)

    # NOTE(review): the option names are kept exactly as spelled in the
    # original code ("MODLE"); confirm they match the Kconfig symbols.
    if is_enabled("CONFIG_SR_VADNET_MODLE_SMALL"):
        models = ['vadnet1_small']
    elif is_enabled("CONFIG_SR_VADNET_MODLE_MEDIUM"):
        models = ['vadnet1_medium']
    else:
        models = []

    for item in models:
        shutil.copytree(os.path.join(model_path, 'vadnet_model', item),
                        os.path.join(target_path, item))
'--sdkconfig_path') @@ -122,6 +144,7 @@ if __name__ == '__main__': copy_multinet_from_sdkconfig(model_path, sdkconfig_path, target_path) copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path) copy_nsnet_from_sdkconfig(model_path, sdkconfig_path, target_path) + copy_vadnet_from_sdkconfig(model_path, sdkconfig_path, target_path) pack_models(target_path, image_file) total_size = os.path.getsize(os.path.join(target_path, image_file)) recommended_size = int(math.ceil(total_size/1024)) diff --git a/model/wakenet_model/wn9_hijason_tts2/_MODEL_INFO_ b/model/wakenet_model/wn9_hijason_tts2/_MODEL_INFO_ new file mode 100644 index 0000000..78e1ddb --- /dev/null +++ b/model/wakenet_model/wn9_hijason_tts2/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9l_tts2h12_Hi,Jason_3_0.642_0.645 diff --git a/model/wakenet_model/wn9_hijason_tts2/wn9_data b/model/wakenet_model/wn9_hijason_tts2/wn9_data new file mode 100644 index 0000000..3947d65 Binary files /dev/null and b/model/wakenet_model/wn9_hijason_tts2/wn9_data differ diff --git a/model/wakenet_model/wn9_hijason_tts2/wn9_index b/model/wakenet_model/wn9_hijason_tts2/wn9_index new file mode 100644 index 0000000..5e7c881 Binary files /dev/null and b/model/wakenet_model/wn9_hijason_tts2/wn9_index differ diff --git a/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/_MODEL_INFO_ b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/_MODEL_INFO_ new file mode 100644 index 0000000..5fadbc2 --- /dev/null +++ b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/_MODEL_INFO_ @@ -0,0 +1 @@ +wakenet9_tts2h12_小鸭小鸭_3_0.595_0.600 diff --git a/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_data b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_data new file mode 100644 index 0000000..dff2ee4 Binary files /dev/null and b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_data differ diff --git a/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_index b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_index new file mode 100644 index 0000000..5e7c881 Binary files /dev/null and 
b/model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_index differ diff --git a/src/esp_mn_speech_commands.c b/src/esp_mn_speech_commands.c index 849fc71..0c6c137 100644 --- a/src/esp_mn_speech_commands.c +++ b/src/esp_mn_speech_commands.c @@ -120,6 +120,10 @@ esp_err_t esp_mn_commands_add(int command_id, const char *string) } #endif +// #if CONFIG_SR_MN_EN_MULTINET7_QUANT || CONFIG_SR_MN_EN_MULTINET6_QUANT || SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 +// ESP_LOGW(TAG, "For English, please use esp_mn_commands_phoneme_add() to add graphemes and phonemes!"); +// #endif + temp = esp_mn_command_search(string); if (temp != NULL) { @@ -166,13 +170,15 @@ esp_err_t esp_mn_commands_phoneme_add(int command_id, const char *string, const int last_node_elem_num = esp_mn_commands_num(); ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands exceed ESP_MN_MAX_PHRASE_NUM"); -#ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT +#if CONFIG_SR_MN_EN_MULTINET7_QUANT || CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 + // the unit of multinet7 or multinet5q8 is phoneme if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, phonemes) == 0) { // error message is printed inside check_speech_command ESP_LOGE(TAG, "invalid command, please check format, %s (%s).\n", string, phonemes); return ESP_ERR_INVALID_STATE; } #else + // The unit of multinet6 is grapheme if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, string) == 0) { // error message is printed inside check_speech_command ESP_LOGE(TAG, "invalid command, please check format, %s.\n", string); @@ -195,16 +201,22 @@ esp_err_t esp_mn_commands_phoneme_add(int command_id, const char *string, const } temp = esp_mn_root; - +#if CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8 + //TODO:: add string for mn5 + esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, phonemes); +#else esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, string); +#endif if 
(phrase == NULL) { return ESP_ERR_INVALID_STATE; } - int phoneme_len = strlen(phonemes); - phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char)); - memcpy(phrase->phonemes, phonemes, phoneme_len); - phrase->phonemes[phoneme_len] = '\0'; - + if (phonemes) { + int phoneme_len = strlen(phonemes); + phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char)); + memcpy(phrase->phonemes, phonemes, phoneme_len); + phrase->phonemes[phoneme_len] = '\0'; + } + esp_mn_node_t *new_node = esp_mn_node_alloc(phrase); while (temp->next != NULL) { temp = temp->next; diff --git a/src/esp_process_sdkconfig.c b/src/esp_process_sdkconfig.c index 120d368..1fe6d09 100644 --- a/src/esp_process_sdkconfig.c +++ b/src/esp_process_sdkconfig.c @@ -950,4 +950,4 @@ end: esp_mn_commands_print(); return esp_mn_commands_update(); -} +} \ No newline at end of file diff --git a/test_apps/esp-sr/main/test_multinet.cpp b/test_apps/esp-sr/main/test_multinet.cpp index 369e23a..49778f0 100644 --- a/test_apps/esp-sr/main/test_multinet.cpp +++ b/test_apps/esp-sr/main/test_multinet.cpp @@ -125,6 +125,83 @@ TEST_CASE("multinet cpu loading", "[mn]") TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); } +// TEST_CASE("multinet set commands and detect", "[mn]") +// { +// vTaskDelay(500 / portTICK_PERIOD_MS); +// srmodel_list_t *models = esp_srmodel_init("model"); +// char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL); +// esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name); + +// model_iface_data_t *model_data = multinet->create(model_name, 6000); +// int frequency = multinet->get_samp_rate(model_data); +// int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t); +// char *lang = multinet->get_language(model_data); +// esp_mn_commands_update_from_sdkconfig(multinet, model_data); +// unsigned char* data = NULL; +// size_t data_size = 0; +// if (strcmp(lang, ESP_MN_ENGLISH) == 0) { +// data = (unsigned char*)tell_me_a_joke; +// data_size = 
sizeof(tell_me_a_joke); +// printf("commands: tell me a joke, size:%d\n", data_size); +// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { +// data = (unsigned char*)da_kai_kong_tiao; +// data_size = sizeof(da_kai_kong_tiao); +// printf("commands: da kai kong tiao, size:%d\n", data_size); +// } + +// int16_t *buffer = (int16_t *) malloc(audio_chunksize); +// int chunks = 0; +// struct timeval tv_start, tv_end; +// gettimeofday(&tv_start, NULL); +// esp_mn_state_t mn_state; +// esp_mn_error_t *error_phrases = NULL; +// esp_mn_commands_clear(); + +// if (strcmp(lang, ESP_MN_ENGLISH) == 0) { +// esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK"); +// esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel"); +// error_phrases = esp_mn_commands_update(); +// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) { +// esp_mn_commands_add(1, "da kai kong tiao"); +// esp_mn_commands_add(2, "guan bi kong tiao"); +// error_phrases = esp_mn_commands_update(); +// } else { +// printf("Invalid language\n"); +// } +// multinet->print_active_speech_commands(model_data); + +// while (1) { +// if ((chunks + 1)*audio_chunksize <= data_size) { +// memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize); +// } else { +// memset(buffer, 0, audio_chunksize); +// } +// mn_state = multinet->detect(model_data, buffer); +// if (mn_state == ESP_MN_STATE_DETECTED) { +// esp_mn_results_t *mn_result = multinet->get_results(model_data); +// if (mn_result->num > 0) +// printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string); +// else +// printf("timeout\n"); +// break; +// } +// chunks++; +// if (chunks > 600) +// break; +// } +// gettimeofday(&tv_end, NULL); +// int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000; +// chunks -= 7; +// int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency; +// printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. 
CPU loading(single core):%.1f%%\n", +// tv_ms, run_ms, chunks, tv_ms*100.0/run_ms); + +// multinet->destroy(model_data); +// esp_srmodel_deinit(models); +// TEST_ASSERT_EQUAL(true, error_phrases == NULL); +// TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED); +// } + TEST_CASE("multinet set commands", "[mn]") { vTaskDelay(500 / portTICK_PERIOD_MS); @@ -367,4 +444,4 @@ TEST_CASE("multinet modify commands", "[mn]") multinet->destroy(model_data); esp_srmodel_deinit(models); TEST_ASSERT_EQUAL(true, 1); -} \ No newline at end of file +}