/*
 * Mirror of https://github.com/espressif/esp-sr.git
 * Synced 2025-09-15 15:28:44 +08:00
 * Original listing: 224 lines, 7.9 KiB, C
 */
#pragma once

/* Type definitions for the MultiNet ("mn") speech-command recognition interface. */

#include <stdint.h>

#include "esp_wn_iface.h"

#ifdef __cplusplus
extern "C" {
#endif

/* Length of the command_id / phrase_id / prob arrays in esp_mn_results_t. */
#define ESP_MN_RESULT_MAX_NUM 5

/* Phrase limits. They are not referenced elsewhere in this header, so their
 * exact semantics (e.g. units of the lengths) are not visible here. */
#define ESP_MN_MAX_PHRASE_NUM 400
#define ESP_MN_MAX_PHRASE_LEN 63
#define ESP_MN_MIN_PHRASE_LEN 2

/* String identifiers. ESP_MN_ENGLISH and ESP_MN_CHINESE are the example
 * language names cited by the get_language operation's documentation. */
#define ESP_MN_PREFIX "mn"
#define ESP_MN_ENGLISH "en"
#define ESP_MN_CHINESE "cn"

/* State value returned by the detect operation (esp_mn_iface_op_detect_t)
 * and stored in esp_mn_results_t. */
typedef enum {
    ESP_MN_STATE_DETECTING = 0, // detecting
    ESP_MN_STATE_DETECTED = 1,  // detected
    ESP_MN_STATE_TIMEOUT = 2,   // timed out
} esp_mn_state_t;

// MultiNet loading mode.
// Memory consumption decreases as the mode value increases;
// as a consequence, the CPU load goes up.
typedef enum {
    ESP_MN_LOAD_FROM_PSRAM = 0,       // Load all weights from PSRAM. Fastest computation with maximum memory consumption
    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1, // Load some weights from PSRAM and load the rest from FLASH (default)
    ESP_MN_LOAD_FROM_FLASH = 2,       // Load more weights from FLASH. Minimum memory consumption with slowest computation
} esp_mn_loader_mode_t;

/* Search method selector. */
typedef enum {
    ESP_MN_GREEDY_SEARCH = 0,        // greedy search
    ESP_MN_BEAM_SEARCH = 1,          // beam search
    ESP_MN_BEAM_SEARCH_WITH_FST = 2, // beam search with trie language model
} esp_mn_search_method_t;

/* Numeric language identifiers. Note that numbering starts at 1, not 0. */
typedef enum {
    CHINESE_ID = 1, // Chinese language
    ENGLISH_ID = 2, // English language
} language_id_t;

// Return all possible recognition results
|
|
typedef struct{
|
|
esp_mn_state_t state;
|
|
int num; // The number of phrase in list, num<=5. When num=0, no phrase is recognized.
|
|
int command_id[ESP_MN_RESULT_MAX_NUM]; // The list of command id.
|
|
int phrase_id[ESP_MN_RESULT_MAX_NUM]; // The list of phrase id.
|
|
float prob[ESP_MN_RESULT_MAX_NUM]; // The list of probability.
|
|
char string[256]; // recognized string with commands graph
|
|
char raw_string[256]; // recognized string without commands graph
|
|
} esp_mn_results_t;
|
|
|
|
/* One speech-command phrase. */
typedef struct {
    char *string;       // command string
    char *phonemes;     // command phonemes, if applicable
    int16_t command_id; // the command id
    float threshold;    // trigger threshold, default: 0
    int16_t *wave;      // prompt wave data of the phrase
} esp_mn_phrase_t;

typedef struct _mn_node_ {
|
|
esp_mn_phrase_t *phrase;
|
|
struct _mn_node_ *next;
|
|
} esp_mn_node_t;
|
|
|
|
typedef struct{
|
|
int16_t num; // The number of error phrases, which can not added into model
|
|
esp_mn_phrase_t **phrases; // The array of error phrase pointer
|
|
} esp_mn_error_t;
|
|
|
|
/**
|
|
* @brief Initialze a model instance with specified model name.
|
|
*
|
|
* @param model_name The wakenet model name.
|
|
* @param duration The duration (ms) to trigger the timeout
|
|
*
|
|
* @returns Handle to the model data.
|
|
*/
|
|
typedef model_iface_data_t* (*esp_mn_iface_op_create_t)(const char *model_name, int duration);
|
|
|
|
/**
|
|
* @brief Switch multinet mode to change memory consumption and CPU loading
|
|
*
|
|
* @warning Just Support multinet6 or later versions
|
|
*
|
|
* @param model The model object to query
|
|
* @param mode The multinet loader mode
|
|
*
|
|
* @returns Handle to the model data.
|
|
*/
|
|
typedef model_iface_data_t* (*esp_mn_iface_op_switch_loader_mode_t)(model_iface_data_t *model, esp_mn_loader_mode_t mode);
|
|
|
|
/**
|
|
* @brief Callback function type to fetch the amount of samples that need to be passed to the detect function
|
|
*
|
|
* Every speech recognition model processes a certain number of samples at the same time. This function
|
|
* can be used to query that amount. Note that the returned amount is in 16-bit samples, not in bytes.
|
|
*
|
|
* @param model The model object to query
|
|
* @return The amount of samples to feed the detect function
|
|
*/
|
|
typedef int (*esp_mn_iface_op_get_samp_chunksize_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Callback function type to fetch the number of frames recognized by the command word
|
|
*
|
|
* @param model The model object to query
|
|
* @return The number of the frames recognized by the command word
|
|
*/
|
|
typedef int (*esp_mn_iface_op_get_samp_chunknum_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Set the detection threshold to manually abjust the probability
|
|
*
|
|
* @param model The model object to query
|
|
* @param det_treshold The threshold to trigger speech commands, the range of det_threshold is 0.0~0.9999
|
|
*/
|
|
typedef int (*esp_mn_iface_op_set_det_threshold_t)(model_iface_data_t *model, float det_threshold);
|
|
|
|
/**
|
|
* @brief Get the sample rate of the samples to feed to the detect function
|
|
*
|
|
* @param model The model object to query
|
|
* @return The sample rate, in hz
|
|
*/
|
|
typedef int (*esp_mn_iface_op_get_samp_rate_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Get the language of model
|
|
*
|
|
* @param model The language name
|
|
* @return Language name string defined in esp_mn_models.h, eg: ESP_MN_CHINESE, ESP_MN_ENGLISH
|
|
*/
|
|
typedef char * (*esp_mn_iface_op_get_language_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Feed samples of an audio stream to the speech recognition model and detect if there is a speech command found.
|
|
*
|
|
* @param model The model object to query.
|
|
* @param samples An array of 16-bit signed audio samples. The array size used can be queried by the
|
|
* get_samp_chunksize function.
|
|
* @return The state of multinet
|
|
*/
|
|
typedef esp_mn_state_t (*esp_mn_iface_op_detect_t)(model_iface_data_t *model, int16_t *samples);
|
|
|
|
/**
|
|
* @brief Destroy a speech commands recognition model
|
|
*
|
|
* @param model The Model object to destroy
|
|
*/
|
|
typedef void (*esp_mn_iface_op_destroy_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Get recognition results
|
|
*
|
|
* @param model The Model object to query
|
|
*
|
|
* @return The current results.
|
|
*/
|
|
typedef esp_mn_results_t* (*esp_mn_iface_op_get_results_t)(model_iface_data_t *model);
|
|
|
|
/**
|
|
* @brief Open the log print
|
|
*
|
|
* @param model_data The model object to query.
|
|
*
|
|
*/
|
|
typedef void (*esp_mn_iface_op_open_log_t)(model_iface_data_t *model_data);
|
|
|
|
/**
|
|
* @brief Clean all status of model
|
|
*
|
|
* @param model_data The model object to query.
|
|
*
|
|
*/
|
|
typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
|
|
|
|
/**
|
|
* @brief Set the speech commands by mn_command_root
|
|
*
|
|
* @param model_data The model object to query.
|
|
* @param mn_command_root The speech commands link.
|
|
* @return The error phrase id info.
|
|
*/
|
|
typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);
|
|
|
|
|
|
/**
|
|
* @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
|
|
*
|
|
* @param model_data The model object to query
|
|
*/
|
|
typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
|
|
|
|
/**
|
|
* @brief Check if input string can be tokenized
|
|
*
|
|
* @param model_data The model object to query
|
|
* @param str The input string
|
|
*/
|
|
typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, const char *str);
|
|
|
|
typedef struct {
|
|
esp_mn_iface_op_create_t create;
|
|
esp_mn_iface_op_get_samp_rate_t get_samp_rate;
|
|
esp_mn_iface_op_get_samp_chunksize_t get_samp_chunksize;
|
|
esp_mn_iface_op_get_samp_chunknum_t get_samp_chunknum;
|
|
esp_mn_iface_op_set_det_threshold_t set_det_threshold;
|
|
esp_mn_iface_op_get_language_t get_language;
|
|
esp_mn_iface_op_detect_t detect;
|
|
esp_mn_iface_op_destroy_t destroy;
|
|
esp_mn_iface_op_get_results_t get_results;
|
|
esp_mn_iface_op_open_log_t open_log;
|
|
esp_mn_iface_op_clean_t clean;
|
|
esp_wn_iface_op_set_speech_commands set_speech_commands;
|
|
esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
|
|
esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
|
|
esp_mn_iface_op_check_speech_command check_speech_command;
|
|
} esp_mn_iface_t;
|
|
|
|
#ifdef __cplusplus
}
#endif