Merge branch 'feat/modify_commands' into 'master'

Update multinet API to add/modify/print/check new commands See merge request speech-recognition-framework/esp-sr!37
2025-09-15 15:28:44 +08:00 · 2023-05-10 20:32:28 +08:00 · 2023-05-10 20:32:28 +08:00 · 31b8cb660e
commit 31b8cb660e
parent 6109ee353e baf24c4b4e
21 changed files with 373 additions and 98 deletions
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@ -58,7 +58,8 @@ build_esp_sr_pdf:
  script:
    - cd $DOCS_DIR
    - ./check_lang_folder_sync.sh
-    - build-docs -bs latex -l $DOCLANG -t $DOCTGT
+    - pip install -r requirements.txt
+    - build-docs --skip-reqs-check -bs latex -l $DOCLANG -t $DOCTGT
  parallel:
    matrix:
      - DOCLANG: ["en", "zh_CN"]
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -2,12 +2,13 @@

 ## Known issues: 
 - Available storage is less than the remaining flash space on IDF v5.0.   
-If you can not map model partition successfully, please check the left free storage by `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)`
+If you cannot map the model partition successfully, please check the remaining free storage with `spi_flash_mmap_get_free_pages(ESP_PARTITION_MMAP_DATA)` or update IDF to v5.1 or later.

-## unreleased
+## 1.3.1
 - Bugfix: remove all cxx11:string
- Bugfix: remove esp-partition require for esp32s2 & esp32c3 on idf v4.4
- Add more loader option for multinet to blance CPU and memory consumption
+- Bugfix: remove esp-partition for esp32s2 & esp32c3 on idf v4.4
+- Update multinet API to add/modify/check new commands in the code
+- Update documents to introduce how to use multinet API

 ## 1.3.0 
 - Update the partition APIs to keep compatible with both IDF v4.4 and IDF v5.0
--- a/Kconfig.projbuild
+++ b/Kconfig.projbuild
@ -153,6 +153,7 @@ choice CHINESE_SR_MN_MODEL_SEL
    config SR_MN_CN_MULTINET6_AC_QUANT
        bool "chinese recognition for air conditioner controller (mn6_cn_ac)"
        depends on IDF_TARGET_ESP32S3
+
 endchoice

 choice ENGLISH_SR_MN_MODEL_SEL
--- a/docs/en/speech_command_recognition/README.rst
+++ b/docs/en/speech_command_recognition/README.rst
@ -68,9 +68,8 @@ MultiNet5 customize speech commands
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

 MultiNet5 uses phonemes for English speech commands. For simplicity, we use characters to denote different phonemes. Please use :project_file:`tool/multinet_g2p.py` to do the conversion.  
-There are two methods to customize speech commands offline:

-  Via ``menuconfig``
+- Via ``menuconfig``

    1. Navigate to ``idf.py menuconfig`` > ``ESP Speech Recognition`` > ``Add Chinese speech commands/Add English speech commands`` to add speech commands. For details, please refer to the example in ESP-Skainet.

@ -86,19 +85,122 @@ There are two methods to customize speech commands offline:
    ::

        /**
-        * @brief Update the speech commands of MultiNet by menuconfig
+            * @brief Update the speech commands of MultiNet by menuconfig
+            *
+            * @param multinet            The multinet handle
+            *
+            * @param model_data          The model object to query
+            *
+            * @param language            The language of MultiNet
+            *
+            * @return
+            *     - ESP_OK                  Success
+            *     - ESP_ERR_INVALID_STATE   Fail
+            */
+            esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
+
+Customize Speech Commands Via API calls
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+Alternatively, speech commands can be modified via API calls. This method works for both MultiNet5 and MultiNet6.
+
+- Print active speech commands, this function will print out all speech commands that are active.
+
+    ::
+
+        /**
+        * @brief Update the speech commands of MultiNet
+        * 
+        * @Warning: Must be used after [add/remove/modify/clear] function, 
+        *           otherwise the language model of multinet can not be updated.
        *
        * @param multinet            The multinet handle
-        *
        * @param model_data          The model object to query
        *
-        * @param langugae            The language of MultiNet
+        * @return
+        *     - NULL                 Success
+        *     - others               The list of error phrase which can not be parsed by multinet.
+        */
+        esp_mn_error_t *esp_mn_commands_update();
+
+    .. note::
+        The modifications will not be applied, thus not printed out, until you call ``esp_mn_commands_update()``.
+
+- Apply new changes, the add/remove/modify/clear actions will not take effect until this function is called.
+
+    ::
+
+        /**
+        * @brief Update the speech commands of MultiNet
+        * 
+        * @Warning: Must be used after [add/remove/modify/clear] function, 
+        *           otherwise the language model of multinet can not be updated.
+        *
+        * @return
+        *     - NULL                 Success
+        *     - others               The list of error phrase which can not be parsed by multinet.
+        */
+        esp_mn_error_t *esp_mn_commands_update();
+
+
+- Add a new speech command, will return ``ESP_ERR_INVALID_STATE`` if the input string is not in the correct format.
+
+    ::
+
+        /**
+        * @brief Add one speech commands with command string and command ID
+        *
+        * @param command_id      The command ID
+        * @param string  The command string of the speech commands
        *
        * @return
        *     - ESP_OK                  Success
        *     - ESP_ERR_INVALID_STATE   Fail
        */
-        esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);
+        esp_err_t esp_mn_commands_add(int command_id, char *string);
+
+- Remove a speech command, will return ``ESP_ERR_INVALID_STATE`` if the command does not exist.
+
+    ::
+
+        /**
+        * @brief Remove one speech commands by command string
+        *
+        * @param string  The command string of the speech commands
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_remove(char *string);
+
+- Modify a speech command, will return ``ESP_ERR_INVALID_STATE`` if the command does not exist.
+
+    ::
+
+        /**
+        * @brief Modify one speech commands with new command string
+        *
+        * @param old_string  The old command string of the speech commands
+        * @param new_string  The new command string of the speech commands
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_modify(char *old_string, char *new_string);
+
+- Clear all speech commands.
+
+    ::
+
+        /**
+        * @brief Clear all speech commands in linked list
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_clear(void);

 Use MultiNet
 ------------
--- a/docs/zh_CN/speech_command_recognition/README.rst
+++ b/docs/zh_CN/speech_command_recognition/README.rst
@ -110,6 +110,109 @@ MultiNet5 定义方法：
        esp_err_t esp_mn_commands_update_from_sdkconfig(esp_mn_iface_t *multinet, const model_iface_data_t *model_data);


+通过调用 API 修改
+~~~~~~~~~~~~~~~~~
+指令还可以通过调用 API 修改，这种方法对于 MultiNet5 和 MultiNet6 都适用。
+
+- 打印现有指令。
+
+    ::
+
+        /**
+        * @brief Update the speech commands of MultiNet
+        * 
+        * @Warning: Must be used after [add/remove/modify/clear] function, 
+        *           otherwise the language model of multinet can not be updated.
+        *
+        * @param multinet            The multinet handle
+        * @param model_data          The model object to query
+        *
+        * @return
+        *     - NULL                 Success
+        *     - others               The list of error phrase which can not be parsed by multinet.
+        */
+        esp_mn_error_t *esp_mn_commands_update();
+
+    .. note::
+        所有修改操作在调用 ``esp_mn_commands_update()`` 后才会被打印出来。
+
+- 应用新的修改操作，所有添加、移除、修改及清空操作在调用后才会被应用。
+
+    ::
+
+        /**
+        * @brief Update the speech commands of MultiNet
+        * 
+        * @Warning: Must be used after [add/remove/modify/clear] function, 
+        *           otherwise the language model of multinet can not be updated.
+        *
+        * @return
+        *     - NULL                 Success
+        *     - others               The list of error phrase which can not be parsed by multinet.
+        */
+        esp_mn_error_t *esp_mn_commands_update();
+
+
+- 添加一条新指令，如果指令格式不正确则返回 ``ESP_ERR_INVALID_STATE``。
+
+    ::
+
+        /**
+        * @brief Add one speech commands with command string and command ID
+        *
+        * @param command_id      The command ID
+        * @param string  The command string of the speech commands
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_add(int command_id, char *string);
+
+- 移除一条指令，如果该指令不存在则返回 ``ESP_ERR_INVALID_STATE``。
+
+    ::
+
+        /**
+        * @brief Remove one speech commands by command string
+        *
+        * @param string  The command string of the speech commands
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_remove(char *string);
+
+- 修改一条指令，如果该指令不存在则返回 ``ESP_ERR_INVALID_STATE``。
+
+    ::
+
+        /**
+        * @brief Modify one speech commands with new command string
+        *
+        * @param old_string  The old command string of the speech commands
+        * @param new_string  The new command string of the speech commands
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_modify(char *old_string, char *new_string);
+
+- 清空所有指令。
+
+    ::
+
+        /**
+        * @brief Clear all speech commands in linked list
+        *
+        * @return
+        *     - ESP_OK                  Success
+        *     - ESP_ERR_INVALID_STATE   Fail
+        */
+        esp_err_t esp_mn_commands_clear(void);
+        
 MultiNet 的使用
 ----------------

--- a/idf_component.yml
+++ b/idf_component.yml
@ -1,4 +1,4 @@
-version: "1.3.0"
+version: "1.3.1"
 description: esp_sr provides basic algorithms for Speech Recognition applications
 url: https://github.com/espressif/esp-sr
 dependencies:
--- a/include/esp32s3/esp_mn_iface.h
+++ b/include/esp32s3/esp_mn_iface.h
@ -3,7 +3,7 @@
 #include "esp_wn_iface.h"

 #define ESP_MN_RESULT_MAX_NUM 5
-#define ESP_MN_MAX_PHRASE_NUM 200
+#define ESP_MN_MAX_PHRASE_NUM 400
 #define ESP_MN_MAX_PHRASE_LEN 63
 #define ESP_MN_MIN_PHRASE_LEN 2

@ -12,8 +12,8 @@
 #define ESP_MN_CHINESE "cn"

 typedef enum {
-	ESP_MN_STATE_DETECTING = 0,     // detecting
-	ESP_MN_STATE_DETECTED = 1,      // detected
+    ESP_MN_STATE_DETECTING = 0,     // detecting
+    ESP_MN_STATE_DETECTED = 1,      // detected
    ESP_MN_STATE_TIMEOUT = 2,       // time out
 } esp_mn_state_t;

@ -21,20 +21,20 @@ typedef enum {
 //The memory consumption is decreased with increasing mode, 
 //As a consequence also the CPU loading rate goes up
 typedef enum {
-	ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
-	ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and laod the rest from FLASH (default)
+    ESP_MN_LOAD_FROM_PSRAM = 0,          // Load all weights from PSRAM. Fastest computation with Maximum memory consumption
+    ESP_MN_LOAD_FROM_PSRAM_FLASH = 1,    // Load some weights from PSRAM and load the rest from FLASH (default)
    ESP_MN_LOAD_FROM_FLASH = 2,          // Load more weights from FLASH. Minimum memory consumption with slowest computation
 } esp_mn_loader_mode_t;

 typedef enum {
-	ESP_MN_GREEDY_SEARCH = 0,          // greedy search
-	ESP_MN_BEAM_SEARCH = 1,            // beam search
+    ESP_MN_GREEDY_SEARCH = 0,          // greedy search
+    ESP_MN_BEAM_SEARCH = 1,            // beam search
    ESP_MN_BEAM_SEARCH_WITH_FST = 2,  // beam search with trie language model
 } esp_mn_search_method_t;

 typedef enum {
-	CHINESE_ID = 1,       // Chinese language
-	ENGLISH_ID = 2,       // English language
+    CHINESE_ID = 1,       // Chinese language
+    ENGLISH_ID = 2,       // English language
 } language_id_t;

 // Return all possible recognition results
@ -47,17 +47,11 @@ typedef struct{
    char string[256];
 } esp_mn_results_t;

-
-typedef struct{
-    int16_t num;                                // The number of error phrases, which can not added into model
-    int16_t phrase_idx[ESP_MN_MAX_PHRASE_NUM];  // The error phrase index in singly linked list．
-} esp_mn_error_t;
-
 typedef struct {
-    char phoneme_string[ESP_MN_MAX_PHRASE_LEN + 1];  // phoneme string
-    int16_t command_id;                              // the command id
-    float threshold;                                 // trigger threshold, default: 0
-    int16_t *wave;                                   // prompt wave data of the phrase
+    char *string;                               // command string
+    int16_t command_id;                         // the command id
+    float threshold;                            // trigger threshold, default: 0
+    int16_t *wave;                              // prompt wave data of the phrase
 } esp_mn_phrase_t;

 typedef struct _mn_node_ {
@ -65,6 +59,11 @@ typedef struct _mn_node_ {
    struct _mn_node_ *next;
 } esp_mn_node_t;

+typedef struct{
+    int16_t num;                                // The number of error phrases, which can not be added into model
+    esp_mn_phrase_t **phrases;                  // The array of error phrase pointer
+} esp_mn_error_t;
+
 /**
 * @brief Initialize a model instance with specified model name.
 *
@ -181,6 +180,22 @@ typedef void (*esp_mn_iface_op_clean_t)(model_iface_data_t *model_data);
 */
 typedef esp_mn_error_t* (*esp_wn_iface_op_set_speech_commands)(model_iface_data_t *model_data, esp_mn_node_t *mn_command_root);

+
+/**
+ * @brief Print out current commands in fst, note the ones "added" but not "updated" will not be shown here
+ * 
+ * @param model_data     The model object to query
+*/
+typedef void (*esp_mn_iface_op_print_active_speech_commands)(model_iface_data_t *model_data);
+
+/**
+ * @brief Check if input string can be tokenized
+ * 
+ * @param model_data     The model object to query
+ * @param str            The input string
+*/
+typedef int (*esp_mn_iface_op_check_speech_command)(model_iface_data_t *model_data, char *str);
+
 typedef struct {
    esp_mn_iface_op_create_t create;
    esp_mn_iface_op_get_samp_rate_t get_samp_rate;
@ -195,4 +210,6 @@ typedef struct {
    esp_mn_iface_op_clean_t clean;
    esp_wn_iface_op_set_speech_commands set_speech_commands;
    esp_mn_iface_op_switch_loader_mode_t switch_loader_mode;
+    esp_mn_iface_op_print_active_speech_commands print_active_speech_commands;
+    esp_mn_iface_op_check_speech_command check_speech_command;
 } esp_mn_iface_t;
--- a/lib/esp32s3/libesp_audio_front_end.a
+++ b/lib/esp32s3/libesp_audio_front_end.a
--- a/lib/esp32s3/libfst.a
+++ b/lib/esp32s3/libfst.a
--- a/lib/esp32s3/libmultinet.a
+++ b/lib/esp32s3/libmultinet.a
--- a/lib/esp32s3/libwakenet.a
+++ b/lib/esp32s3/libwakenet.a
--- a/model/multinet_model/mn6_cn/_MODEL_INFO_
+++ b/model/multinet_model/mn6_cn/_MODEL_INFO_
@ -1,2 +1,2 @@
 # (neural network type)_(model data version)_(lable1_detection windown length_threshold for 90%_threshold for 95%)_(lable2 ...)_...
-MN6_v1_chinese_8_0.9_0.90
+MN6_v3_chinese_8_0.9_0.90
--- a/model/multinet_model/mn6_cn/mn6_data
+++ b/model/multinet_model/mn6_cn/mn6_data
--- a/model/multinet_model/mn6_cn/mn6_index
+++ b/model/multinet_model/mn6_cn/mn6_index
--- a/model/multinet_model/mn6_cn_ac/mn6_data
+++ b/model/multinet_model/mn6_cn_ac/mn6_data
--- a/model/multinet_model/mn6_cn_ac/mn6_index
+++ b/model/multinet_model/mn6_cn_ac/mn6_index
--- a/src/esp_mn_speech_commands.c
+++ b/src/esp_mn_speech_commands.c
@ -4,9 +4,13 @@
 #include "esp_log.h"
 #include "esp_heap_caps.h"
 #include "esp_mn_speech_commands.h"
+#include "esp_mn_iface.h"

 static char *TAG = "MN_COMMAND";
 static esp_mn_node_t *esp_mn_root = NULL;
+static esp_mn_iface_t *esp_mn_model_handle = NULL;
+static model_iface_data_t *esp_mn_model_data = NULL;
+

 #define ESP_RETURN_ON_FALSE(a, err_code, log_tag, format, ...) do {                             \
        if (!(a)) {                                                                             \
@ -15,10 +19,14 @@ static esp_mn_node_t *esp_mn_root = NULL;
        }                                                                                       \
    } while(0)

-esp_err_t esp_mn_commands_alloc(void)
+esp_err_t esp_mn_commands_alloc(esp_mn_iface_t *multinet, model_iface_data_t *model_data)
 {
-    ESP_RETURN_ON_FALSE(NULL == esp_mn_root, ESP_ERR_INVALID_STATE, TAG, "The mn commands already initialized");
+    if (esp_mn_root != NULL) {
+        esp_mn_commands_free();
+    }
    esp_mn_root = esp_mn_node_alloc(NULL);
+    esp_mn_model_handle = multinet;
+    esp_mn_model_data = model_data;
    return ESP_OK;
 }

@ -27,6 +35,8 @@ esp_err_t esp_mn_commands_free(void)
    esp_mn_commands_clear();
    esp_mn_node_free(esp_mn_root);
    esp_mn_root = NULL;
+    esp_mn_model_handle = NULL;
+    esp_mn_model_data = NULL;

    return ESP_OK;
 }
@ -57,14 +67,53 @@ esp_err_t esp_mn_commands_clear(void)
    return ESP_OK;
 }

-esp_err_t esp_mn_commands_add(int command_id, char *phoneme_string)
-{
+esp_mn_node_t *esp_mn_command_search(char *string) {
+    int command_id;
    esp_mn_node_t *temp = esp_mn_root;
    ESP_RETURN_ON_FALSE(NULL != esp_mn_root, ESP_ERR_INVALID_STATE, TAG, "The mn commands is not initialized");
-    int last_node_elem_num = esp_mn_commands_num();
-    ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands phrase must less than 200");

-    esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, phoneme_string);
+    while (temp->next) {
+        temp = temp->next;
+        if (strcmp(string, temp->phrase->string) == 0) {
+            return temp;
+        }
+    }
+    return NULL;
+}
+
+esp_err_t esp_mn_commands_add(int command_id, char *string)
+{
+    if (NULL == esp_mn_root || esp_mn_model_handle == NULL || esp_mn_model_data == NULL) {
+        ESP_LOGE(TAG, "Please create mn model first.\n");
+        return ESP_ERR_INVALID_STATE;
+    }
+    esp_mn_node_t *temp = esp_mn_root;
+    int last_node_elem_num = esp_mn_commands_num();
+    ESP_RETURN_ON_FALSE(ESP_MN_MAX_PHRASE_NUM >= last_node_elem_num, ESP_ERR_INVALID_STATE, TAG, "The number of speech commands exceed ESP_MN_MAX_PHRASE_NUM");
+
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, string) == 0) {
+        // error message is printed inside check_speech_command
+        ESP_LOGE(TAG, "invalid command, please check format, %s.\n", string);
+        return ESP_ERR_INVALID_STATE;
+    }
+
+    temp = esp_mn_command_search(string);
+
+    if (temp != NULL) {
+        // command already exists
+        if (command_id != temp->phrase->command_id) {
+            // change command id
+            temp->phrase->command_id = command_id;
+        } else {
+            // it's exactly the same, do nothing
+            ESP_LOGI(TAG, "command %d: (%s) already exists.", command_id, string);
+        }
+        return ESP_OK;
+    }
+
+    temp = esp_mn_root;
+
+    esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, string);
    if (phrase == NULL) {
        return ESP_ERR_INVALID_STATE;
    }
@ -78,48 +127,43 @@ esp_err_t esp_mn_commands_add(int command_id, char *phoneme_string)
    return ESP_OK;
 }

-esp_err_t esp_mn_commands_modify(char *old_phoneme_string, char *new_phoneme_string)
+esp_err_t esp_mn_commands_modify(char *old_string, char *new_string)
 {
+    if (esp_mn_model_handle->check_speech_command(esp_mn_model_data, new_string) == 0) {
+        // error message is printed inside check_speech_command
+        return ESP_ERR_INVALID_STATE;
+    }
    esp_mn_node_t *temp = esp_mn_root;
    ESP_RETURN_ON_FALSE(NULL != esp_mn_root, ESP_ERR_INVALID_STATE, TAG, "The mn commands is not initialized");

-    // search old phoneme_string to get command id
-    bool flag = false;
-    int command_id;
-    while (temp->next) {
-        temp = temp->next;
-        if (strcmp(old_phoneme_string, temp->phrase->phoneme_string) == 0) {
-            command_id = temp->phrase->command_id;
-            flag = true;
-            break;
-        }
-    }
+    // search old string to get command id
+    temp = esp_mn_command_search(old_string);

    // replace old phrase with new phrase
-    if (flag) {
-        esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(command_id, new_phoneme_string);
+    if (temp != NULL) {
+        esp_mn_phrase_t *phrase = esp_mn_phrase_alloc(temp->phrase->command_id, new_string);
        if (phrase == NULL) {
            return ESP_ERR_INVALID_STATE;
        }
        esp_mn_phrase_free(temp->phrase);
        temp->phrase = phrase;
    } else {
-        ESP_LOGE(TAG, "No such speech command: \"%s\"", old_phoneme_string);
+        ESP_LOGE(TAG, "No such speech command: \"%s\"", old_string);
        return ESP_ERR_INVALID_STATE;
    }

    return ESP_OK;
 }

-esp_err_t esp_mn_commands_remove(char *phoneme_string)
+esp_err_t esp_mn_commands_remove(char *string)
 {
    esp_mn_node_t *temp = esp_mn_root;
    ESP_RETURN_ON_FALSE(NULL != esp_mn_root, ESP_ERR_INVALID_STATE, TAG, "The mn commands is not initialized");

-    // search phoneme_string to get node point
+    // search string to get node point
    bool flag = false;
    while (temp->next) {
-        if (strcmp(phoneme_string, temp->next->phrase->phoneme_string) == 0) {
+        if (strcmp(string, temp->next->phrase->string) == 0) {
            flag = true;
            break;
        }
@ -132,7 +176,7 @@ esp_err_t esp_mn_commands_remove(char *phoneme_string)
        esp_mn_node_free(rm_node);
        return ESP_OK;
    } else {
-        ESP_LOGE(TAG, "No such speech command: \"%s\"", phoneme_string);
+        ESP_LOGE(TAG, "No such speech command: \"%s\"", string);
        return ESP_ERR_INVALID_STATE;
    }

@ -155,14 +199,14 @@ esp_mn_phrase_t *esp_mn_commands_get_from_index(int index)
    return temp->phrase;
 }

-esp_mn_phrase_t *esp_mn_commands_get_from_string(const char *phoneme_string)
+esp_mn_phrase_t *esp_mn_commands_get_from_string(const char *string)
 {
    ESP_RETURN_ON_FALSE(NULL != esp_mn_root, NULL, TAG, "The mn commands is not initialized");

    // phrase index also is phrase id, which is the depth from this phrase node to root node
    esp_mn_node_t *temp = esp_mn_root;
    while (temp->next) {
-        if (strcmp(phoneme_string, temp->next->phrase->phoneme_string) == 0) {
+        if (strcmp(string, temp->next->phrase->string) == 0) {
            return temp->next->phrase;
        }
        temp = temp->next;
@ -171,10 +215,10 @@ esp_mn_phrase_t *esp_mn_commands_get_from_string(const char *phoneme_string)
    return NULL;
 }

-esp_mn_error_t *esp_mn_commands_update(const esp_mn_iface_t *multinet, model_iface_data_t *model_data)
+esp_mn_error_t *esp_mn_commands_update()
 {
    ESP_RETURN_ON_FALSE(NULL != esp_mn_root, NULL, TAG, "The mn commands is not initialize");
-    esp_mn_error_t *error = multinet->set_speech_commands(model_data, esp_mn_root);
+    esp_mn_error_t *error = esp_mn_model_handle->set_speech_commands(esp_mn_model_data, esp_mn_root);

    if (error->num == 0) {
        return NULL;
@ -190,7 +234,7 @@ void esp_mn_commands_print(void)
    int phrase_id = 0;
    while (temp->next) {
        temp = temp->next;
-        ESP_LOGI(TAG, "Command ID%d, phrase ID%d: %s", temp->phrase->command_id, phrase_id, temp->phrase->phoneme_string);
+        ESP_LOGI(TAG, "Command ID%d, phrase ID%d: %s", temp->phrase->command_id, phrase_id, temp->phrase->string);
        phrase_id++;
    }
    ESP_LOGI(TAG, "---------------------------------------------------------\n");
@ -205,21 +249,21 @@ void *_esp_mn_calloc_(int n, int size)
 #endif
 }

-esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *phoneme_string)
+esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *string)
 {

-    int phoneme_string_len = strlen(phoneme_string);
-    if (phoneme_string_len > ESP_MN_MAX_PHRASE_LEN || phoneme_string_len < 1) {
-        ESP_LOGE(TAG, "The Length of \"%s\" > ESP_MN_MAX_PHRASE_LEN", phoneme_string);
-        return NULL;
-    }
+    int string_len = strlen(string);
+    ESP_RETURN_ON_FALSE( string_len > 0, NULL, TAG, "input string is empty");

    esp_mn_phrase_t *phrase = _esp_mn_calloc_(1, sizeof(esp_mn_phrase_t));
    ESP_RETURN_ON_FALSE(NULL != phrase, NULL, TAG, "Fail to alloc mn phrase");
+
+    phrase->string = malloc((string_len+1) * sizeof(char));
+    memcpy(phrase->string, string, string_len);
+    phrase->string[string_len] = '\0';
    phrase->command_id = command_id;
    phrase->threshold = 0;
    phrase->wave = NULL;
-    memcpy(phrase->phoneme_string, phoneme_string, phoneme_string_len);

    return phrase;
 }
@ -227,6 +271,12 @@ esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *phoneme_string)
 void esp_mn_phrase_free(esp_mn_phrase_t *phrase)
 {
    if (phrase != NULL) {
+        if (phrase->wave != NULL) {
+            free(phrase->wave);
+        }
+        if (phrase->string != NULL) {
+            free(phrase->string);
+        }
        free(phrase);
    }
 }
--- a/src/esp_process_sdkconfig.c
+++ b/src/esp_process_sdkconfig.c
@ -876,11 +876,11 @@ char *get_id_name_en(int i)

 esp_mn_error_t *esp_mn_commands_update_from_sdkconfig(const esp_mn_iface_t *multinet,  model_iface_data_t *model_data)
 {
-#if defined CONFIG_SR_MN_CN_MULTINET6_QUANT || defined CONFIG_SR_MN_EN_MULTINET6_QUANT
+#if defined CONFIG_SR_MN_CN_MULTINET6_QUANT || defined CONFIG_SR_MN_EN_MULTINET6_QUANT || defined CONFIG_SR_MN_CN_MULTINET6_AC_QUANT
    return NULL;
 #endif

-    esp_mn_commands_alloc();
+    esp_mn_commands_alloc(multinet, model_data);
    printf("esp_mn_commands_update_from_sdkconfig\n");
    int total_phrase_num = 0;
    int language_id = 1; // 0: Chinese, 1:English
@ -939,4 +939,4 @@ end:
    esp_mn_commands_print();

    return esp_mn_commands_update(multinet, model_data);
-}
+}
--- a/src/include/esp_mn_speech_commands.h
+++ b/src/include/esp_mn_speech_commands.h
@ -29,7 +29,7 @@ It is easy to add one speech command into linked list and remove one speech comm
 *     - ESP_ERR_NO_MEM          No memory
 *     - ESP_ERR_INVALID_STATE   The Speech Commands link has been initialized
 */
-esp_err_t esp_mn_commands_alloc(void);
+esp_err_t esp_mn_commands_alloc(esp_mn_iface_t *multinet, model_iface_data_t *model_data);

 /**
 * @brief Clear the speech commands linked list and free root node.
@ -41,39 +41,39 @@ esp_err_t esp_mn_commands_alloc(void);
 esp_err_t esp_mn_commands_free(void);

 /**
- * @brief Add one speech commands with phoneme string and command ID
+ * @brief Add one speech commands with command string and command ID
 *
 * @param command_id      The command ID
- * @param phoneme_string  The phoneme string of the speech commands
+ * @param string  The command string of the speech commands
 *
 * @return
 *     - ESP_OK                  Success
 *     - ESP_ERR_INVALID_STATE   Fail
 */
-esp_err_t esp_mn_commands_add(int command_id, char *phoneme_string);
+esp_err_t esp_mn_commands_add(int command_id, char *string);

 /**
- * @brief Modify one speech commands with new phoneme string
+ * @brief Modify one speech commands with new command string
 *
- * @param old_phoneme_string  The old phoneme string of the speech commands
- * @param new_phoneme_string  The new phoneme string of the speech commands
+ * @param old_string  The old command string of the speech commands
+ * @param new_string  The new command string of the speech commands
 *
 * @return
 *     - ESP_OK                  Success
 *     - ESP_ERR_INVALID_STATE   Fail
 */
-esp_err_t esp_mn_commands_modify(char *old_phoneme_string, char *new_phoneme_string);
+esp_err_t esp_mn_commands_modify(char *old_string, char *new_string);

 /**
- * @brief Remove one speech commands by phoneme string
+ * @brief Remove one speech commands by command string
 *
- * @param phoneme_string  The phoneme string of the speech commands
+ * @param string  The command string of the speech commands
 *
 * @return
 *     - ESP_OK                  Success
 *     - ESP_ERR_INVALID_STATE   Fail
 */
-esp_err_t esp_mn_commands_remove(char *phoneme_string);
+esp_err_t esp_mn_commands_remove(char *string);

 /**
 * @brief Clear all speech commands in linked list
@ -96,40 +96,32 @@ esp_err_t esp_mn_commands_clear(void);
 esp_mn_phrase_t *esp_mn_commands_get_from_index(int index);

 /**
- * @brief Get phrase from phoneme string
+ * @brief Get phrase from command string
 *
 * @return
 *     - esp_mn_phrase_t*        Success
 *     - NULL                    Fail
 */
-esp_mn_phrase_t *esp_mn_commands_get_from_string(const char *phoneme_string);
+esp_mn_phrase_t *esp_mn_commands_get_from_string(const char *string);

 /**
 * @brief Update the speech commands of MultiNet
 * 
 * @Warning: Must be used after [add/remove/modify/clear] function, 
 *           otherwise the language model of multinet can not be updated.
- *
- * @param multinet            The multinet handle
- * @param model_data          The model object to query
- *
+ * 
 * @return
 *     - NULL                 Success
 *     - others               The list of error phrase which can not be parsed by multinet.
 */
-esp_mn_error_t *esp_mn_commands_update(const esp_mn_iface_t *multinet, model_iface_data_t *model_data);
+esp_mn_error_t *esp_mn_commands_update();

 /**
- * @brief Print the MultiNet Speech Commands.
- */
-void esp_mn_print_commands(void);
-
-/**
- * @brief Initialze the esp_mn_phrase_t struct by command id and phoneme string .
+ * @brief Initialize the esp_mn_phrase_t struct by command id and command string.
 *
 * @return the pointer of esp_mn_phrase_t
 */
-esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *phoneme_string);
+esp_mn_phrase_t *esp_mn_phrase_alloc(int command_id, char *string);

 /**
 * @brief Free esp_mn_phrase_t pointer.
--- a/src/include/model_path.h
+++ b/src/include/model_path.h
@ -16,7 +16,9 @@ typedef struct {

 typedef struct {
    char **model_name;                        // the name of models, like "wn9_hilexin"(wakenet9, hilexin), "mn5_en"(multinet5, english)
+#ifdef ESP_PLATFORM
    esp_partition_t *partition;               // partition label used to save the files of model
+#endif
    void * mmap_handle;                       // mmap_handle if using esp_partition_mmap else NULL; 
    int num;                                  // the number of models
    srmodel_data_t **model_data;              // the model data , NULL if spiffs format
@ -75,7 +77,9 @@ int esp_srmodel_exists(srmodel_list_t *models, char *model_name);
 *
 * @return all available models in spiffs, saved as srmodel_list_t.
 */
+#ifdef ESP_PLATFORM
 srmodel_list_t *srmodel_spiffs_init(const esp_partition_t *part);
+#endif

 /**
 * @brief unregister SPIFFS filesystem and free srmodel_list_t.
--- a/src/model_path.c
+++ b/src/model_path.c
@ -32,7 +32,9 @@ static srmodel_list_t *srmodel_list_alloc(void)
    models->model_data = NULL;
    models->model_name = NULL;
    models->num = 0;
+#ifdef ESP_PLATFORM
    models->partition = NULL;
+#endif
    models->mmap_handle = NULL;

    return models;
@ -431,7 +433,9 @@ srmodel_list_t *srmodel_sdcard_init(const char *base_path)
            return models;
        } else {
            models->num = model_num;
+#ifdef ESP_PLATFORM
            models->partition = NULL;
+#endif
            models->model_name = malloc(models->num * sizeof(char *));
            for (int i = 0; i < models->num; i++) {
                models->model_name[i] = (char *) calloc(MODEL_NAME_MAX_LENGTH, sizeof(char));