/** * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved. * MIT License (https://opensource.org/licenses/MIT) */ #pragma once #include #include "cppjieba/DictTrie.hpp" #include "cppjieba/HMMModel.hpp" #include "cppjieba/Jieba.hpp" #include "nlohmann/json.hpp" namespace funasr { class CTokenizer { private: bool m_ready = false; vector m_id2token,m_id2punc; map m_token2id,m_punc2id; cppjieba::DictTrie *jieba_dict_trie_=nullptr; cppjieba::HMMModel *jieba_model_=nullptr; cppjieba::Jieba jieba_processor_; public: CTokenizer(const char* sz_yamlfile); CTokenizer(); ~CTokenizer(); bool OpenYaml(const char* sz_yamlfile); bool OpenYaml(const char* sz_yamlfile, const char* token_file); void ReadYaml(const YAML::Node& node); vector Id2String(vector input); vector String2Ids(vector input); int String2Id(string input); vector Id2Punc(vector input); string Id2Punc(int n_punc_id); vector Punc2Ids(vector input); vector SplitChineseString(const string& str_info); vector SplitChineseJieba(const string& str_info); void StrSplit(const string& str, const char split, vector& res); void Tokenize(const char* str_info, vector& str_out, vector& id_out); bool IsPunc(string& Punc); bool seg_jieba = false; void SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm); void JiebaInit(std::string punc_config); }; } // namespace funasr