mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
model(wn9): add xiaoyaxiaoya and hijason wakenet9 model
This commit is contained in:
parent
32d1339772
commit
a5dc2e511b
@ -105,7 +105,7 @@ choice SR_WN_MODEL_LOAD
|
||||
config SR_WN_WN9_HIMFIVE
|
||||
bool "Hi,M Five (wn9_himfive)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
|
||||
config SR_WN_WN9_NIHAOXIAOZHI_TTS
|
||||
bool "你好小智 (wn9_nihaoxiaozhi_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
@ -186,6 +186,14 @@ choice SR_WN_MODEL_LOAD
|
||||
bool "Astrolabe (wn9_astrolabe_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOYAXIAOYA_TTS2
|
||||
bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIJASON_TTS2
|
||||
bool "Hi,Jason (wn9_hijason_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_CUSTOMWORD
|
||||
bool "customized word (wn9_customword)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
@ -293,6 +301,14 @@ menu "Load Multiple Wake Words"
|
||||
bool "Astrolabe (wn9_astrolabe_tts)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_XIAOYAXIAOYA_TTS2_MULTI
|
||||
bool "小鸭小鸭 (wn9_xiaoyaxiaoya_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
config SR_WN_WN9_HIJASON_TTS2_MULTI
|
||||
bool "Hi,Jason (wn9_hijason_tts2)"
|
||||
depends on IDF_TARGET_ESP32S3 || IDF_TARGET_ESP32P4
|
||||
|
||||
endmenu
|
||||
|
||||
config USE_MULTINET
|
||||
|
||||
@ -47,6 +47,7 @@ The following wake words are supported in esp-sr:
|
||||
|Hi,Joy | | wn9_hijoy_tts |
|
||||
|Hey,Wanda | | wn9_heywanda_tts |
|
||||
|Astrolabe | | wn9_astrolabe_tts |
|
||||
|Hi,Jason | | wn9_hijason_tts2 |
|
||||
|你好小鑫 | | wn9_nihaoxiaoxin_tts |
|
||||
|小美同学 | | wn9_xiaomeitongxue_tts |
|
||||
|Hi,小星 | | wn9_hixiaoxing_tts |
|
||||
@ -57,8 +58,9 @@ The following wake words are supported in esp-sr:
|
||||
|Hi,Telly/Hi,泰力 | | wn9_hitelly_tts |
|
||||
|小滨小滨/小冰小冰| | wn9_xiaobinxiaobin_tts |
|
||||
|Hi,小巫 | | wn9_haixiaowu_tts |
|
||||
|小鸭小鸭 | | wn9_xiaoyaxiaoya_tts2 |
|
||||
|
||||
*NOTE:* `_tts` suffix means this WakeNet model is trained by TTS samples.
|
||||
*NOTE:* The `_tts` suffix means the WakeNet model was trained on TTS samples. The `_tts2` suffix means the WakeNet model was trained with TTS Pipeline V2.
|
||||
|
||||
## Speech Command Recognition
|
||||
|
||||
|
||||
@ -1,12 +1,14 @@
|
||||
import io
|
||||
import os
|
||||
import argparse
|
||||
import shutil
|
||||
import io
|
||||
import math
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
sys.dont_write_bytecode = True
|
||||
from pack_model import pack_models
|
||||
|
||||
|
||||
def calculate_total_size(folder_path):
|
||||
total_size = 0
|
||||
for file_name in os.listdir(folder_path):
|
||||
@ -103,6 +105,26 @@ def copy_nsnet_from_sdkconfig(model_path, sdkconfig_path, target_path):
|
||||
for item in models:
|
||||
shutil.copytree(model_path + '/nsnet_model/' + item, target_path+'/'+item)
|
||||
|
||||
def copy_vadnet_from_sdkconfig(model_path, sdkconfig_path, target_path):
    """
    Copy vadnet model from model_path to target_path based on sdkconfig

    Args:
        model_path: Root model directory; the model is read from its
            ``vadnet_model`` subdirectory.
        sdkconfig_path: Path of the sdkconfig file that is scanned for
            enabled ``CONFIG_SR_VADNET`` entries.
        target_path: Directory into which the selected model directory
            is copied (as ``target_path/<model name>``).

    Nothing is copied when no matching entry is enabled. When both
    entries are enabled, only the small model is copied.
    """
    # Decode explicitly instead of relying on the locale default, and never
    # fail on a stray byte: only ASCII symbol names are matched below, so a
    # replaced character in an unrelated comment cannot change the result.
    with io.open(sdkconfig_path, "r", encoding="utf-8", errors="replace") as f:
        enabled = []
        for label in f:
            label = label.strip("\n")
            # Lines starting with '#' (e.g. "# CONFIG_... is not set") are
            # disabled entries and must be ignored.
            if 'CONFIG_SR_VADNET' in label and not label.startswith('#'):
                enabled.append(label)
    models_string = ''.join(enabled)

    models = []
    # NOTE(review): the "MODLE" spelling is kept exactly as written;
    # presumably it mirrors the Kconfig symbol names -- confirm before renaming.
    if "CONFIG_SR_VADNET_MODLE_SMALL" in models_string:
        models.append('vadnet1_small')
    elif "CONFIG_SR_VADNET_MODLE_MEDIUM" in models_string:
        models.append('vadnet1_medium')

    for item in models:
        shutil.copytree(model_path + '/vadnet_model/' + item, target_path + '/' + item)
|
||||
|
||||
if __name__ == '__main__':
|
||||
parser = argparse.ArgumentParser(description='Model generator tool')
|
||||
parser.add_argument('-d1', '--sdkconfig_path')
|
||||
@ -122,6 +144,7 @@ if __name__ == '__main__':
|
||||
copy_multinet_from_sdkconfig(model_path, sdkconfig_path, target_path)
|
||||
copy_wakenet_from_sdkconfig(model_path, sdkconfig_path, target_path)
|
||||
copy_nsnet_from_sdkconfig(model_path, sdkconfig_path, target_path)
|
||||
copy_vadnet_from_sdkconfig(model_path, sdkconfig_path, target_path)
|
||||
pack_models(target_path, image_file)
|
||||
total_size = os.path.getsize(os.path.join(target_path, image_file))
|
||||
recommended_size = int(math.ceil(total_size/1024))
|
||||
|
||||
1
model/wakenet_model/wn9_hijason_tts2/_MODEL_INFO_
Normal file
1
model/wakenet_model/wn9_hijason_tts2/_MODEL_INFO_
Normal file
@ -0,0 +1 @@
|
||||
wakenet9l_tts2h12_Hi,Jason_3_0.642_0.645
|
||||
BIN
model/wakenet_model/wn9_hijason_tts2/wn9_data
Normal file
BIN
model/wakenet_model/wn9_hijason_tts2/wn9_data
Normal file
Binary file not shown.
BIN
model/wakenet_model/wn9_hijason_tts2/wn9_index
Normal file
BIN
model/wakenet_model/wn9_hijason_tts2/wn9_index
Normal file
Binary file not shown.
1
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/_MODEL_INFO_
Normal file
1
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/_MODEL_INFO_
Normal file
@ -0,0 +1 @@
|
||||
wakenet9_tts2h12_小鸭小鸭_3_0.595_0.600
|
||||
BIN
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_data
Normal file
BIN
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_data
Normal file
Binary file not shown.
BIN
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_index
Normal file
BIN
model/wakenet_model/wn9_xiaoyaxiaoya_tts2/wn9_index
Normal file
Binary file not shown.
@ -120,6 +120,10 @@ esp_err_t esp_mn_commands_add(int command_id, const char *string)
|
||||
}
|
||||
#endif
|
||||
|
||||
// #if CONFIG_SR_MN_EN_MULTINET7_QUANT || CONFIG_SR_MN_EN_MULTINET6_QUANT || SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8
|
||||
// ESP_LOGW(TAG, "For English, please use esp_mn_commands_phoneme_add() to add graphemes and phonemes!");
|
||||
// #endif
|
||||
|
||||
temp = esp_mn_command_search(string);
|
||||
|
||||
if (temp != NULL) {
|
||||
@ -166,13 +170,15 @@ esp_err_t esp_mn_commands_phoneme_add(int command_id, const char *string, const
|
||||
int last_node_elem_num = esp_mn_commands_num();
|
||||
ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands exceed ESP_MN_MAX_PHRASE_NUM");
|
||||
|
||||
#ifdef CONFIG_SR_MN_EN_MULTINET7_QUANT
|
||||
#if CONFIG_SR_MN_EN_MULTINET7_QUANT || CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8
|
||||
// the unit of multinet7 or multinet5q8 is phoneme
|
||||
if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, phonemes) == 0) {
|
||||
// error message is printed inside check_speech_command
|
||||
ESP_LOGE(TAG, "invalid command, please check format, %s (%s).\n", string, phonemes);
|
||||
return ESP_ERR_INVALID_STATE;
|
||||
}
|
||||
#else
|
||||
// The unit of multinet6 is grapheme
|
||||
if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, string) == 0) {
|
||||
// error message is printed inside check_speech_command
|
||||
ESP_LOGE(TAG, "invalid command, please check format, %s.\n", string);
|
||||
@ -195,16 +201,22 @@ esp_err_t esp_mn_commands_phoneme_add(int command_id, const char *string, const
|
||||
}
|
||||
|
||||
temp = esp_mn_root;
|
||||
|
||||
#if CONFIG_SR_MN_EN_MULTINET5_SINGLE_RECOGNITION_QUANT8
|
||||
//TODO:: add string for mn5
|
||||
esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, phonemes);
|
||||
#else
|
||||
esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, string);
|
||||
#endif
|
||||
if (phrase == NULL) {
|
||||
return ESP_ERR_INVALID_STATE;
|
||||
}
|
||||
int phoneme_len = strlen(phonemes);
|
||||
phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char));
|
||||
memcpy(phrase->phonemes, phonemes, phoneme_len);
|
||||
phrase->phonemes[phoneme_len] = '\0';
|
||||
|
||||
if (phonemes) {
|
||||
int phoneme_len = strlen(phonemes);
|
||||
phrase->phonemes = _esp_mn_calloc_(phoneme_len+1, sizeof(char));
|
||||
memcpy(phrase->phonemes, phonemes, phoneme_len);
|
||||
phrase->phonemes[phoneme_len] = '\0';
|
||||
}
|
||||
|
||||
esp_mn_node_t *new_node = esp_mn_node_alloc(phrase);
|
||||
while (temp->next != NULL) {
|
||||
temp = temp->next;
|
||||
|
||||
@ -950,4 +950,4 @@ end:
|
||||
esp_mn_commands_print();
|
||||
|
||||
return esp_mn_commands_update();
|
||||
}
|
||||
}
|
||||
@ -125,87 +125,82 @@ TEST_CASE("multinet cpu loading", "[mn]")
|
||||
TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
|
||||
}
|
||||
|
||||
TEST_CASE("multinet set commands and detect", "[mn]")
|
||||
{
|
||||
vTaskDelay(500 / portTICK_PERIOD_MS);
|
||||
srmodel_list_t *models = esp_srmodel_init("model");
|
||||
char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
|
||||
esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
|
||||
// TEST_CASE("multinet set commands and detect", "[mn]")
|
||||
// {
|
||||
// vTaskDelay(500 / portTICK_PERIOD_MS);
|
||||
// srmodel_list_t *models = esp_srmodel_init("model");
|
||||
// char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
|
||||
// esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
|
||||
|
||||
model_iface_data_t *model_data = multinet->create(model_name, 6000);
|
||||
int frequency = multinet->get_samp_rate(model_data);
|
||||
int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
|
||||
char *lang = multinet->get_language(model_data);
|
||||
esp_mn_commands_update_from_sdkconfig(multinet, model_data);
|
||||
unsigned char* data = NULL;
|
||||
size_t data_size = 0;
|
||||
if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
|
||||
data = (unsigned char*)tell_me_a_joke;
|
||||
data_size = sizeof(tell_me_a_joke);
|
||||
printf("commands: tell me a joke, size:%d\n", data_size);
|
||||
} else if(strcmp(lang, ESP_MN_CHINESE) == 0) {
|
||||
data = (unsigned char*)da_kai_kong_tiao;
|
||||
data_size = sizeof(da_kai_kong_tiao);
|
||||
printf("commands: da kai kong tiao, size:%d\n", data_size);
|
||||
}
|
||||
// model_iface_data_t *model_data = multinet->create(model_name, 6000);
|
||||
// int frequency = multinet->get_samp_rate(model_data);
|
||||
// int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
|
||||
// char *lang = multinet->get_language(model_data);
|
||||
// esp_mn_commands_update_from_sdkconfig(multinet, model_data);
|
||||
// unsigned char* data = NULL;
|
||||
// size_t data_size = 0;
|
||||
// if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
|
||||
// data = (unsigned char*)tell_me_a_joke;
|
||||
// data_size = sizeof(tell_me_a_joke);
|
||||
// printf("commands: tell me a joke, size:%d\n", data_size);
|
||||
// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) {
|
||||
// data = (unsigned char*)da_kai_kong_tiao;
|
||||
// data_size = sizeof(da_kai_kong_tiao);
|
||||
// printf("commands: da kai kong tiao, size:%d\n", data_size);
|
||||
// }
|
||||
|
||||
int16_t *buffer = (int16_t *) malloc(audio_chunksize);
|
||||
int chunks = 0;
|
||||
struct timeval tv_start, tv_end;
|
||||
gettimeofday(&tv_start, NULL);
|
||||
esp_mn_state_t mn_state;
|
||||
esp_mn_error_t *error_phrases = NULL;
|
||||
esp_mn_commands_clear();
|
||||
// int16_t *buffer = (int16_t *) malloc(audio_chunksize);
|
||||
// int chunks = 0;
|
||||
// struct timeval tv_start, tv_end;
|
||||
// gettimeofday(&tv_start, NULL);
|
||||
// esp_mn_state_t mn_state;
|
||||
// esp_mn_error_t *error_phrases = NULL;
|
||||
// esp_mn_commands_clear();
|
||||
|
||||
if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
|
||||
if (strcmp(model_name, "mn5q8_en") == 0) {
|
||||
esp_mn_commands_add(1, "TfL Mm c qbK");
|
||||
esp_mn_commands_add(2, "Sgl c Sel");
|
||||
} else {
|
||||
esp_mn_commands_add(1, "TELL ME A JOKE");
|
||||
esp_mn_commands_add(2, "SING A SONG");
|
||||
}
|
||||
error_phrases = esp_mn_commands_update();
|
||||
} else if(strcmp(lang, ESP_MN_CHINESE) == 0) {
|
||||
esp_mn_commands_add(1, "da kai kong tiao");
|
||||
esp_mn_commands_add(2, "guan bi kong tiao");
|
||||
error_phrases = esp_mn_commands_update();
|
||||
} else {
|
||||
printf("Invalid language\n");
|
||||
}
|
||||
multinet->print_active_speech_commands(model_data);
|
||||
// if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
|
||||
// esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK");
|
||||
// esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel");
|
||||
// error_phrases = esp_mn_commands_update();
|
||||
// } else if(strcmp(lang, ESP_MN_CHINESE) == 0) {
|
||||
// esp_mn_commands_add(1, "da kai kong tiao");
|
||||
// esp_mn_commands_add(2, "guan bi kong tiao");
|
||||
// error_phrases = esp_mn_commands_update();
|
||||
// } else {
|
||||
// printf("Invalid language\n");
|
||||
// }
|
||||
// multinet->print_active_speech_commands(model_data);
|
||||
|
||||
while (1) {
|
||||
if ((chunks + 1)*audio_chunksize <= data_size) {
|
||||
memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
|
||||
} else {
|
||||
memset(buffer, 0, audio_chunksize);
|
||||
}
|
||||
mn_state = multinet->detect(model_data, buffer);
|
||||
if (mn_state == ESP_MN_STATE_DETECTED) {
|
||||
esp_mn_results_t *mn_result = multinet->get_results(model_data);
|
||||
if (mn_result->num > 0)
|
||||
printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string);
|
||||
else
|
||||
printf("timeout\n");
|
||||
break;
|
||||
}
|
||||
chunks++;
|
||||
if (chunks > 600)
|
||||
break;
|
||||
}
|
||||
gettimeofday(&tv_end, NULL);
|
||||
int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
|
||||
chunks -= 7;
|
||||
int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency;
|
||||
printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
|
||||
tv_ms, run_ms, chunks, tv_ms*100.0/run_ms);
|
||||
// while (1) {
|
||||
// if ((chunks + 1)*audio_chunksize <= data_size) {
|
||||
// memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
|
||||
// } else {
|
||||
// memset(buffer, 0, audio_chunksize);
|
||||
// }
|
||||
// mn_state = multinet->detect(model_data, buffer);
|
||||
// if (mn_state == ESP_MN_STATE_DETECTED) {
|
||||
// esp_mn_results_t *mn_result = multinet->get_results(model_data);
|
||||
// if (mn_result->num > 0)
|
||||
// printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string);
|
||||
// else
|
||||
// printf("timeout\n");
|
||||
// break;
|
||||
// }
|
||||
// chunks++;
|
||||
// if (chunks > 600)
|
||||
// break;
|
||||
// }
|
||||
// gettimeofday(&tv_end, NULL);
|
||||
// int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
|
||||
// chunks -= 7;
|
||||
// int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency;
|
||||
// printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
|
||||
// tv_ms, run_ms, chunks, tv_ms*100.0/run_ms);
|
||||
|
||||
multinet->destroy(model_data);
|
||||
esp_srmodel_deinit(models);
|
||||
TEST_ASSERT_EQUAL(true, error_phrases == NULL);
|
||||
TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
|
||||
}
|
||||
// multinet->destroy(model_data);
|
||||
// esp_srmodel_deinit(models);
|
||||
// TEST_ASSERT_EQUAL(true, error_phrases == NULL);
|
||||
// TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
|
||||
// }
|
||||
|
||||
TEST_CASE("multinet set commands", "[mn]")
|
||||
{
|
||||
|
||||
Loading…
Reference in New Issue
Block a user