Merge branch 'model/hiwalle' into 'master'

Model/hiwalle

See merge request speech-recognition-framework/esp-sr!147
This commit is contained in:
Sun Xiang Yu 2025-03-10 19:47:32 +08:00
commit a91e427bb3
7 changed files with 147 additions and 74 deletions

View File

@ -196,6 +196,11 @@ menu "Load Multiple Wake Words"
config SR_WN_WN9_XIAOMINGTONGXUE_TTS2
bool "小明同学 (wn9_xiaomingtongxue_tts2)"
default False
config SR_WN_WN9_HIWALLE_TTS2
bool "Hi Wall E or Hi 瓦力(wn9_hiwalle_tts2)"
default False
endmenu

View File

@ -54,6 +54,7 @@ The following wake words are supported in esp-sr:
|Hey,Wanda | | wn9_heywanda_tts |
|Astrolabe | | wn9_astrolabe_tts |
|Hi,Jason | | wn9_hijason_tts2 |
|Hi,Wall E/Hi,瓦力| | wn9_hiwalle_tts2 |
|你好小鑫 | | wn9_nihaoxiaoxin_tts |
|小美同学 | | wn9_xiaomeitongxue_tts |
|Hi,小星 | | wn9_hixiaoxing_tts |

View File

@ -126,7 +126,7 @@ typedef struct {
afe_agc_mode_t
agc_mode; // The AGC mode for ASR. and the gain generated by AGC acts on the audio after far linear gain.
int agc_compression_gain_db; // Compression gain in dB (default 9)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default -3)
int agc_target_level_dbfs; // Target level in -dBfs of envelope (default 3, means target level is -3 dBFS)
/********** General AFE(Audio Front End) parameter **********/
afe_pcm_config_t pcm_config; // Config the channel num of original data which is fed to the afe feed function.

View File

@ -0,0 +1 @@
wakenet9l_tts2h12_Hi,Wall,E or Hi,瓦力_3_0.630_0.635

Binary file not shown.

Binary file not shown.

View File

@ -126,82 +126,148 @@ TEST_CASE("multinet cpu loading", "[mn]")
TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
}
// TEST_CASE("multinet set commands and detect", "[mn]")
// {
// vTaskDelay(500 / portTICK_PERIOD_MS);
// srmodel_list_t *models = esp_srmodel_init("model");
// char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
// esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
/*
 * Registers two speech commands at runtime, streams the matching embedded
 * recording through multinet->detect() one chunk at a time, and expects a
 * detection with no rejected phrases.
 *
 * English models get phoneme-based commands and the tell_me_a_joke clip;
 * Chinese models get plain command strings and the da_kai_kong_tiao clip.
 * Any other language prints "Invalid language" and only silence is fed.
 */
TEST_CASE("multinet set commands and detect", "[mn]")
{
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
    esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
    model_iface_data_t *model_data = multinet->create(model_name, 6000);
    int frequency = multinet->get_samp_rate(model_data);
    int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
    char *lang = multinet->get_language(model_data);
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);

    /* Pick the embedded recording that matches the model language. */
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        data = (unsigned char *)tell_me_a_joke;
        data_size = sizeof(tell_me_a_joke);
        /* Cast so the argument type matches the %d conversion. */
        printf("commands: tell me a joke, size:%d\n", (int)data_size);
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        data = (unsigned char *)da_kai_kong_tiao;
        data_size = sizeof(da_kai_kong_tiao);
        printf("commands: da kai kong tiao, size:%d\n", (int)data_size);
    }

    int16_t *buffer = (int16_t *)malloc(audio_chunksize);
    /* Fail the test here instead of writing through a NULL pointer below. */
    TEST_ASSERT_EQUAL(true, buffer != NULL);

    int chunks = 0;
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    esp_mn_state_t mn_state;
    esp_mn_error_t *error_phrases = NULL;

    /* Drop the sdkconfig command set and install the test commands instead. */
    esp_mn_commands_clear();
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        esp_mn_commands_phoneme_add(1, "TELL ME A JOKE", "TfL Mm c qbK");
        esp_mn_commands_phoneme_add(2, "SING A SONG", "Sgl c Sel");
        error_phrases = esp_mn_commands_update();
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        esp_mn_commands_add(1, "da kai kong tiao");
        esp_mn_commands_add(2, "guan bi kong tiao");
        error_phrases = esp_mn_commands_update();
    } else {
        printf("Invalid language\n");
    }
    multinet->print_active_speech_commands(model_data);

    while (1) {
        /* Feed recorded audio while a full chunk is left, then pad with silence. */
        if ((chunks + 1) * audio_chunksize <= data_size) {
            memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
        } else {
            memset(buffer, 0, audio_chunksize);
        }

        mn_state = multinet->detect(model_data, buffer);
        if (mn_state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            if (mn_result->num > 0) {
                printf("detected: command id:%d, string:%s\n", mn_result->command_id[0], mn_result->string);
            } else {
                printf("timeout\n");
            }
            break;
        }

        chunks++;
        /* Give up after 601 chunks without a detection. */
        if (chunks > 600) {
            break;
        }
    }

    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    /* NOTE(review): the reason for discounting 7 chunks is not visible in this file - confirm. */
    chunks -= 7;
    int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
           tv_ms, run_ms, chunks, tv_ms * 100.0 / run_ms);

    /* Release everything before the final assertions run. */
    free(buffer);
    multinet->destroy(model_data);
    esp_srmodel_deinit(models);

    TEST_ASSERT_EQUAL(true, error_phrases == NULL);
    TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
}
/*
 * Loads the speech commands configured in sdkconfig, streams the matching
 * embedded recording through multinet->detect() one chunk at a time, and
 * expects a detection.
 *
 * English models use the tell_me_a_joke clip and Chinese models the
 * da_kai_kong_tiao clip; for any other language only silence is fed.
 */
TEST_CASE("multinet set commands from sdkconfig and detect", "[mn]")
{
    vTaskDelay(500 / portTICK_PERIOD_MS);
    srmodel_list_t *models = esp_srmodel_init("model");
    char *model_name = esp_srmodel_filter(models, ESP_MN_PREFIX, NULL);
    esp_mn_iface_t *multinet = esp_mn_handle_from_name(model_name);
    model_iface_data_t *model_data = multinet->create(model_name, 6000);
    int frequency = multinet->get_samp_rate(model_data);
    int audio_chunksize = multinet->get_samp_chunksize(model_data) * sizeof(int16_t);
    char *lang = multinet->get_language(model_data);
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);

    /* Pick the embedded recording that matches the model language. */
    unsigned char *data = NULL;
    size_t data_size = 0;
    if (strcmp(lang, ESP_MN_ENGLISH) == 0) {
        data = (unsigned char *)tell_me_a_joke;
        data_size = sizeof(tell_me_a_joke);
        /* Cast so the argument type matches the %d conversion. */
        printf("commands: tell me a joke, size:%d\n", (int)data_size);
    } else if (strcmp(lang, ESP_MN_CHINESE) == 0) {
        data = (unsigned char *)da_kai_kong_tiao;
        data_size = sizeof(da_kai_kong_tiao);
        printf("commands: da kai kong tiao, size:%d\n", (int)data_size);
    }

    int16_t *buffer = (int16_t *)malloc(audio_chunksize);
    /* Fail the test here instead of writing through a NULL pointer below. */
    TEST_ASSERT_EQUAL(true, buffer != NULL);

    int chunks = 0;
    struct timeval tv_start, tv_end;
    gettimeofday(&tv_start, NULL);
    esp_mn_state_t mn_state;
    /* Never assigned in this test, so the final check on it cannot fail. */
    esp_mn_error_t *error_phrases = NULL;

    /*
     * NOTE(review): the commands were already loaded from sdkconfig above, so
     * this repeated call looks redundant. It sits inside the timed region;
     * confirm before removing it.
     */
    esp_mn_commands_update_from_sdkconfig(multinet, model_data);
    multinet->print_active_speech_commands(model_data);

    while (1) {
        /* Feed recorded audio while a full chunk is left, then pad with silence. */
        if ((chunks + 1) * audio_chunksize <= data_size) {
            memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
        } else {
            memset(buffer, 0, audio_chunksize);
        }

        mn_state = multinet->detect(model_data, buffer);
        if (mn_state == ESP_MN_STATE_DETECTED) {
            esp_mn_results_t *mn_result = multinet->get_results(model_data);
            if (mn_result->num > 0) {
                printf("detected: command id:%d, string:%s\n", mn_result->command_id[0], mn_result->string);
            } else {
                printf("timeout\n");
            }
            break;
        }

        chunks++;
        /* Give up after 601 chunks without a detection. */
        if (chunks > 600) {
            break;
        }
    }

    gettimeofday(&tv_end, NULL);
    int tv_ms = (tv_end.tv_sec - tv_start.tv_sec) * 1000 + (tv_end.tv_usec - tv_start.tv_usec) / 1000;
    /* NOTE(review): the reason for discounting 7 chunks is not visible in this file - confirm. */
    chunks -= 7;
    int run_ms = (chunks) * audio_chunksize / sizeof(int16_t) * 1000 / frequency;
    printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
           tv_ms, run_ms, chunks, tv_ms * 100.0 / run_ms);

    /* Release everything before the final assertions run. */
    free(buffer);
    multinet->destroy(model_data);
    esp_srmodel_deinit(models);

    TEST_ASSERT_EQUAL(true, error_phrases == NULL);
    TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
}
// while (1) {
// if ((chunks + 1)*audio_chunksize <= data_size) {
// memcpy(buffer, data + chunks * audio_chunksize, audio_chunksize);
// } else {
// memset(buffer, 0, audio_chunksize);
// }
// mn_state = multinet->detect(model_data, buffer);
// if (mn_state == ESP_MN_STATE_DETECTED) {
// esp_mn_results_t *mn_result = multinet->get_results(model_data);
// if (mn_result->num > 0)
// printf("detected: command id:%d, string:%s\n",mn_result->command_id[0], mn_result->string);
// else
// printf("timeout\n");
// break;
// }
// chunks++;
// if (chunks > 600)
// break;
// }
// gettimeofday(&tv_end, NULL);
// int tv_ms=(tv_end.tv_sec-tv_start.tv_sec)*1000+(tv_end.tv_usec-tv_start.tv_usec)/1000;
// chunks -= 7;
// int run_ms = (chunks)*audio_chunksize/sizeof(int16_t)*1000/frequency;
// printf("Done! Took %d ms to parse %d ms worth of samples in %d iterations. CPU loading(single core):%.1f%%\n",
// tv_ms, run_ms, chunks, tv_ms*100.0/run_ms);
// multinet->destroy(model_data);
// esp_srmodel_deinit(models);
// TEST_ASSERT_EQUAL(true, error_phrases == NULL);
// TEST_ASSERT_EQUAL(true, mn_state == ESP_MN_STATE_DETECTED);
// }
TEST_CASE("multinet set commands", "[mn]")
{