mirror of
https://github.com/espressif/esp-sr.git
synced 2025-09-15 15:28:44 +08:00
Merge branch 'model/hiwalle' into 'master'
Model/hiwalle See merge request speech-recognition-framework/esp-sr!147
This commit is contained in:
commit
a91e427bb3
@ -196,6 +196,11 @@ menu "Load Multiple Wake Words"
|
||||
config SR_WN_WN9_XIAOMINGTONGXUE_TTS2
|
||||
bool "小明同学 (wn9_xiaomingtongxue_tts2)"
|
||||
default False
|
||||
|
||||
|
||||
config SR_WN_WN9_HIWALLE_TTS2
|
||||
bool "Hi Wall E or Hi 瓦力(wn9_hiwalle_tts2)"
|
||||
default False
|
||||
endmenu
|
||||
|
||||
|
||||
|
||||
@ -54,6 +54,7 @@ The following wake words are supported in esp-sr:
|
||||
|Hey,Wanda | | wn9_heywanda_tts |
|
||||
|Astrolabe | | wn9_astrolabe_tts |
|
||||
|Hi,Jason | | wn9_hijason_tts2 |
|
||||
|Hi,Wall E/Hi,瓦力| | wn9_hiwalle_tts2 |
|
||||
|你好小鑫 | | wn9_nihaoxiaoxin_tts |
|
||||
|小美同学 | | wn9_xiaomeitongxue_tts |
|
||||
|Hi,小星 | | wn9_hixiaoxing_tts |
|
||||
|
||||
@ -126,7 +126,7 @@ typedef struct {
|
||||
afe_agc_mode_t
|
||||
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
|
||||
int agc_compression_gain_db; // Compression gain in dB (default 9)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
|
||||
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
|
||||
|
||||
/********** General AFE(Audio Front End) parameter **********/
|
||||
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.
|
||||
|
||||
1
model/wakenet_model/wn9_hiwalle_tts2/_MODEL_INFO_
Normal file
1
model/wakenet_model/wn9_hiwalle_tts2/_MODEL_INFO_
Normal file
@ -0,0 +1 @@
|
||||
wakenet9l_tts2h12_Hi,Wall,E or Hi,瓦力_3_0.630_0.635
|
||||
BIN
model/wakenet_model/wn9_hiwalle_tts2/wn9_data
Normal file
BIN
model/wakenet_model/wn9_hiwalle_tts2/wn9_data
Normal file
Binary file not shown.
BIN
model/wakenet_model/wn9_hiwalle_tts2/wn9_index
Normal file
BIN
model/wakenet_model/wn9_hiwalle_tts2/wn9_index
Normal file
Binary file not shown.
@ -126,82 +126,148 @@ TEST_CASE("multinet cpu loading", "[mn]")
|
||||
TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
|
||||
}
|
||||
|
||||
// TEST_CASE("multinet set commands and detect", "[mn]")
|
||||
// {
|
||||
// vTaskDelay(500 / portTICK_PERIOD_MS);
|
||||
// srmodel_list_t *models = esp_srmodel_init("model");
|
||||
// char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
|
||||
// esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
|
||||
/*
 * Registers two speech commands programmatically (phoneme-based for the
 * English model, pinyin strings for the Chinese model), feeds the embedded
 * sample buffer matching the model language through multinet->detect(), and
 * asserts that command registration reported no error phrases and that the
 * detector ended in ESP_MN_STATE_DETECTED.
 */
TEST_CASE("multinet set commands and detect", "[mn]")
{
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
    esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);

    /* NOTE(review): 6000 is presumably the detection timeout in ms - confirm. */
    model_iface_data_t *model_data = multinet->create(model_name, 6000);
    int frequency = multinet->get_samp_rate(model_data);
    /* Size in bytes of one chunk handed to detect(). */
    int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
    char *lang = multinet->get_language(model_data);
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);

    /* Pick the sample buffer that matches the model language. For any other
     * language data stays NULL / data_size stays 0, so only zeroed chunks are fed. */
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        data = (unsigned char *)tell_me_a_joke;
        data_size = sizeof(tell_me_a_joke);
        /* size_t does not match %d; cast explicitly so the output text is unchanged. */
        printf("commands: tell me a joke, size:%d\n", (int)data_size);
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        data = (unsigned char *)da_kai_kong_tiao;
        data_size = sizeof(da_kai_kong_tiao);
        printf("commands: da kai kong tiao, size:%d\n", (int)data_size);
    }

    int16_t *buffer = (int16_t *) malloc(audio_chunksize);
    TEST_ASSERT_NOT_NULL(buffer);
    int chunks = 0;
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    esp_mn_state_t mn_state;
    esp_mn_error_t *error_phrases = NULL;

    /* Drop whatever the sdkconfig loaded and register the commands under test. */
    esp_mn_commands_clear();
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK");
        esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel");
        error_phrases = esp_mn_commands_update();
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        esp_mn_commands_add(1, "da kai kong tiao");
        esp_mn_commands_add(2, "guan bi kong tiao");
        error_phrases = esp_mn_commands_update();
    } else {
        printf("Invalid language\n");
    }
    multinet->print_active_speech_commands(model_data);

    while (1) {
        /* Feed the next full chunk of sample data; once the data is exhausted, feed silence. */
        if ((chunks + 1) * audio_chunksize <= data_size) {
            memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
        } else {
            memset(buffer, 0, audio_chunksize);
        }

        mn_state = multinet->detect(model_data, buffer);
        if (mn_state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            if (mn_result->num > 0) {
                printf("detected: command id:%d, string:%s\n", mn_result->command_id[0], mn_result->string);
            } else {
                printf("timeout\n");
            }
            break;
        }

        chunks++;
        /* Safety stop so the test terminates even if nothing is ever detected. */
        if (chunks > 600) {
            break;
        }
    }

    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    /* NOTE(review): the reason for discounting 7 chunks is not visible in this file - confirm. */
    chunks -= 7;
    int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
           tv_ms, run_ms, chunks, tv_ms * 100.0 / run_ms);

    /* Release everything before asserting: a failing assertion leaves the test body early. */
    free(buffer);
    multinet->destroy(model_data);
    esp_srmodel_deinit(models);
    TEST_ASSERT_EQUAL(true, error_phrases == NULL);
    TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
}
|
||||
|
||||
/*
 * Loads the speech commands from sdkconfig, feeds the embedded sample buffer
 * matching the model language through multinet->detect(), and asserts that
 * the detector ended in ESP_MN_STATE_DETECTED.
 *
 * NOTE(review): error_phrases is never assigned in this test, so the
 * error_phrases assertion at the end always passes - confirm whether the
 * result of esp_mn_commands_update_from_sdkconfig() was meant to be checked.
 */
TEST_CASE("multinet set commands from sdkconfig and detect", "[mn]")
{
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
    esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);

    /* NOTE(review): 6000 is presumably the detection timeout in ms - confirm. */
    model_iface_data_t *model_data = multinet->create(model_name, 6000);
    int frequency = multinet->get_samp_rate(model_data);
    /* Size in bytes of one chunk handed to detect(). */
    int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
    char *lang = multinet->get_language(model_data);
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);

    /* Pick the sample buffer that matches the model language. For any other
     * language data stays NULL / data_size stays 0, so only zeroed chunks are fed. */
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        data = (unsigned char *)tell_me_a_joke;
        data_size = sizeof(tell_me_a_joke);
        /* size_t does not match %d; cast explicitly so the output text is unchanged. */
        printf("commands: tell me a joke, size:%d\n", (int)data_size);
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        data = (unsigned char *)da_kai_kong_tiao;
        data_size = sizeof(da_kai_kong_tiao);
        printf("commands: da kai kong tiao, size:%d\n", (int)data_size);
    }

    int16_t *buffer = (int16_t *) malloc(audio_chunksize);
    TEST_ASSERT_NOT_NULL(buffer);
    int chunks = 0;
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    esp_mn_state_t mn_state;
    esp_mn_error_t *error_phrases = NULL;

    /* NOTE(review): second load of the sdkconfig commands, inside the timed
     * region; kept as in the original - confirm whether it is intentional. */
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);
    multinet->print_active_speech_commands(model_data);

    while (1) {
        /* Feed the next full chunk of sample data; once the data is exhausted, feed silence. */
        if ((chunks + 1) * audio_chunksize <= data_size) {
            memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
        } else {
            memset(buffer, 0, audio_chunksize);
        }

        mn_state = multinet->detect(model_data, buffer);
        if (mn_state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            if (mn_result->num > 0) {
                printf("detected: command id:%d, string:%s\n", mn_result->command_id[0], mn_result->string);
            } else {
                printf("timeout\n");
            }
            break;
        }

        chunks++;
        /* Safety stop so the test terminates even if nothing is ever detected. */
        if (chunks > 600) {
            break;
        }
    }

    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    /* NOTE(review): the reason for discounting 7 chunks is not visible in this file - confirm. */
    chunks -= 7;
    int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
           tv_ms, run_ms, chunks, tv_ms * 100.0 / run_ms);

    /* Release everything before asserting: a failing assertion leaves the test body early. */
    free(buffer);
    multinet->destroy(model_data);
    esp_srmodel_deinit(models);
    TEST_ASSERT_EQUAL(true, error_phrases == NULL);
    TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
}
|
||||
|
||||
// while (1) {
|
||||
// if ((chunks + 1)*audio_chunksize <= data_size) {
|
||||
// memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
|
||||
// } else {
|
||||
// memset(buffer, 0, audio_chunksize);
|
||||
// }
|
||||
// mn_state = multinet->detect(model_data, buffer);
|
||||
// if (mn_state == ESP_MN_STATE_DETECTED) {
|
||||
// esp_mn_results_t *mn_result = multinet->get_results(model_data);
|
||||
// if (mn_result->num > 0)
|
||||
// printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string);
|
||||
// else
|
||||
// printf("timeout\n");
|
||||
// break;
|
||||
// }
|
||||
// chunks++;
|
||||
// if (chunks > 600)
|
||||
// break;
|
||||
// }
|
||||
// gettimeofday(&tv_end, NULL);
|
||||
// int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
|
||||
// chunks -= 7;
|
||||
// int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency;
|
||||
// printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
|
||||
// tv_ms, run_ms, chunks, tv_ms*100.0/run_ms);
|
||||
|
||||
// multinet->destroy(model_data);
|
||||
// esp_srmodel_deinit(models);
|
||||
// TEST_ASSERT_EQUAL(true, error_phrases == NULL);
|
||||
// TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
|
||||
// }
|
||||
|
||||
TEST_CASE("multinet set commands", "[mn]")
|
||||
{
|
||||
|
||||
Loading…
Reference in New Issue
Block a user