From 91231a03f5c16fff0d9d54f859c7a9aa02fd239c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=9B=BE=E8=81=AA?= Date: Mon, 16 Oct 2023 14:47:17 +0800 Subject: [PATCH] add jieba for ct-transformer --- funasr/runtime/onnxruntime/CMakeLists.txt | 2 + .../runtime/onnxruntime/include/com-define.h | 4 + .../onnxruntime/src/ct-transformer.cpp | 1 + funasr/runtime/onnxruntime/src/tokenizer.cpp | 65 ++- funasr/runtime/onnxruntime/src/tokenizer.h | 11 + .../jieba/include/cppjieba/DictTrie.hpp | 274 ++++++++++++ .../jieba/include/cppjieba/FullSegment.hpp | 102 +++++ .../jieba/include/cppjieba/HMMModel.hpp | 129 ++++++ .../jieba/include/cppjieba/HMMSegment.hpp | 197 +++++++++ .../jieba/include/cppjieba/Jieba.hpp | 141 ++++++ .../include/cppjieba/KeywordExtractor.hpp | 154 +++++++ .../jieba/include/cppjieba/MPSegment.hpp | 144 ++++++ .../jieba/include/cppjieba/MixSegment.hpp | 113 +++++ .../jieba/include/cppjieba/PosTagger.hpp | 77 ++++ .../jieba/include/cppjieba/PreFilter.hpp | 54 +++ .../jieba/include/cppjieba/QuerySegment.hpp | 95 ++++ .../jieba/include/cppjieba/SegmentBase.hpp | 46 ++ .../jieba/include/cppjieba/SegmentTagged.hpp | 23 + .../include/cppjieba/TextRankExtractor.hpp | 190 ++++++++ .../jieba/include/cppjieba/Trie.hpp | 200 +++++++++ .../jieba/include/cppjieba/Unicode.hpp | 227 ++++++++++ .../jieba/include/limonp/ArgvContext.hpp | 70 +++ .../jieba/include/limonp/BlockingQueue.hpp | 49 +++ .../include/limonp/BoundedBlockingQueue.hpp | 67 +++ .../jieba/include/limonp/BoundedQueue.hpp | 65 +++ .../jieba/include/limonp/Closure.hpp | 206 +++++++++ .../jieba/include/limonp/Colors.hpp | 31 ++ .../jieba/include/limonp/Condition.hpp | 38 ++ .../jieba/include/limonp/Config.hpp | 103 +++++ .../jieba/include/limonp/FileLock.hpp | 74 ++++ .../jieba/include/limonp/ForcePublic.hpp | 7 + .../jieba/include/limonp/LocalVector.hpp | 139 ++++++ .../jieba/include/limonp/Logging.hpp | 90 ++++ .../third_party/jieba/include/limonp/Md5.hpp | 411 ++++++++++++++++++ 
.../jieba/include/limonp/MutexLock.hpp | 51 +++ .../jieba/include/limonp/NonCopyable.hpp | 21 + .../jieba/include/limonp/StdExtension.hpp | 157 +++++++ .../jieba/include/limonp/StringUtil.hpp | 405 +++++++++++++++++ .../jieba/include/limonp/Thread.hpp | 44 ++ .../jieba/include/limonp/ThreadPool.hpp | 86 ++++ funasr/runtime/websocket/CMakeLists.txt | 2 + 41 files changed, 4362 insertions(+), 3 deletions(-) create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp create mode 100644 
funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BlockingQueue.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Logging.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp create mode 100644 funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt index b9a26b2a3..64d1c8250 100644 --- a/funasr/runtime/onnxruntime/CMakeLists.txt +++ b/funasr/runtime/onnxruntime/CMakeLists.txt @@ -32,6 
+32,8 @@ endif() include_directories(${PROJECT_SOURCE_DIR}/third_party/kaldi-native-fbank) include_directories(${PROJECT_SOURCE_DIR}/third_party/yaml-cpp/include) +include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include) +include_directories(${PROJECT_SOURCE_DIR}/third_party/jieba/include/limonp/include) if(ENABLE_GLOG) include_directories(${PROJECT_SOURCE_DIR}/third_party/glog/src) diff --git a/funasr/runtime/onnxruntime/include/com-define.h b/funasr/runtime/onnxruntime/include/com-define.h index 9f28e15d3..0ecb9c2b6 100644 --- a/funasr/runtime/onnxruntime/include/com-define.h +++ b/funasr/runtime/onnxruntime/include/com-define.h @@ -107,4 +107,8 @@ namespace funasr { #define DUN_INDEX 5 #define CACHE_POP_TRIGGER_LIMIT 200 +#define JIEBA_DICT "jieba.c.dict" +#define JIEBA_USERDICT "jieba_usr_dict" +#define JIEBA_HMM_MODEL "jieba.hmm" + } // namespace funasr diff --git a/funasr/runtime/onnxruntime/src/ct-transformer.cpp b/funasr/runtime/onnxruntime/src/ct-transformer.cpp index 71a8847c4..8f8d95310 100644 --- a/funasr/runtime/onnxruntime/src/ct-transformer.cpp +++ b/funasr/runtime/onnxruntime/src/ct-transformer.cpp @@ -40,6 +40,7 @@ void CTTransformer::InitPunc(const std::string &punc_model, const std::string &p m_szOutputNames.push_back(item.c_str()); m_tokenizer.OpenYaml(punc_config.c_str()); + m_tokenizer.JiebaInit(punc_config); } CTTransformer::~CTTransformer() diff --git a/funasr/runtime/onnxruntime/src/tokenizer.cpp b/funasr/runtime/onnxruntime/src/tokenizer.cpp index cd3f02758..a111b9157 100644 --- a/funasr/runtime/onnxruntime/src/tokenizer.cpp +++ b/funasr/runtime/onnxruntime/src/tokenizer.cpp @@ -17,6 +17,41 @@ CTokenizer::CTokenizer():m_ready(false) CTokenizer::~CTokenizer() { + delete jieba_dict_trie_; + delete jieba_model_; +} + +void CTokenizer::SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm) { + jieba_processor_.SetJiebaRes(dict, hmm); +} + +void CTokenizer::JiebaInit(std::string punc_config){ + if (seg_jieba){ + std::string 
model_path = punc_config.substr(0, punc_config.length() - (sizeof(PUNC_CONFIG_NAME)-1)); + std::string jieba_dict_file = PathAppend(model_path, JIEBA_DICT); + std::string jieba_hmm_file = PathAppend(model_path, JIEBA_HMM_MODEL); + std::string jieba_userdict_file = PathAppend(model_path, JIEBA_USERDICT); + try{ + jieba_dict_trie_ = new cppjieba::DictTrie(jieba_dict_file, jieba_userdict_file); + LOG(INFO) << "Successfully load file from " << jieba_dict_file << ", " << jieba_userdict_file; + }catch(exception const &e){ + LOG(ERROR) << "Error loading file, Jieba dict file error or not exist."; + exit(-1); + } + + try{ + jieba_model_ = new cppjieba::HMMModel(jieba_hmm_file); + LOG(INFO) << "Successfully load model from " << jieba_hmm_file; + }catch(exception const &e){ + LOG(ERROR) << "Error loading file, Jieba hmm file error or not exist."; + exit(-1); + } + + SetJiebaRes(jieba_dict_trie_, jieba_model_); + }else { + jieba_dict_trie_ = NULL; + jieba_model_ = NULL; + } } void CTokenizer::ReadYaml(const YAML::Node& node) @@ -50,6 +85,11 @@ bool CTokenizer::OpenYaml(const char* sz_yamlfile) try { + YAML::Node conf_seg_jieba = m_Config["seg_jieba"]; + if (conf_seg_jieba.IsDefined()){ + seg_jieba = conf_seg_jieba.as(); + } + auto Tokens = m_Config["token_list"]; if (Tokens.IsSequence()) { @@ -167,6 +207,14 @@ vector CTokenizer::SplitChineseString(const string & str_info) return list; } +vector CTokenizer::SplitChineseJieba(const string & str_info) +{ + vector list; + jieba_processor_.Cut(str_info, list, false); + + return list; +} + void CTokenizer::StrSplit(const string& str, const char split, vector& res) { if (str == "") @@ -184,7 +232,7 @@ void CTokenizer::StrSplit(const string& str, const char split, vector& r } } - void CTokenizer::Tokenize(const char* str_info, vector & str_out, vector & id_out) +void CTokenizer::Tokenize(const char* str_info, vector & str_out, vector & id_out) { vector strList; StrSplit(str_info,' ', strList); @@ -200,7 +248,12 @@ void 
CTokenizer::StrSplit(const string& str, const char split, vector& r if (current_chinese.size() > 0) { // for utf-8 chinese - auto chineseList = SplitChineseString(current_chinese); + vector chineseList; + if(seg_jieba){ + chineseList = SplitChineseJieba(current_chinese); + }else{ + chineseList = SplitChineseString(current_chinese); + } str_out.insert(str_out.end(), chineseList.begin(),chineseList.end()); current_chinese = ""; } @@ -218,7 +271,13 @@ void CTokenizer::StrSplit(const string& str, const char split, vector& r } if (current_chinese.size() > 0) { - auto chineseList = SplitChineseString(current_chinese); + // for utf-8 chinese + vector chineseList; + if(seg_jieba){ + chineseList = SplitChineseJieba(current_chinese); + }else{ + chineseList = SplitChineseString(current_chinese); + } str_out.insert(str_out.end(), chineseList.begin(), chineseList.end()); current_chinese = ""; } diff --git a/funasr/runtime/onnxruntime/src/tokenizer.h b/funasr/runtime/onnxruntime/src/tokenizer.h index 3b1d1c579..149161ba3 100644 --- a/funasr/runtime/onnxruntime/src/tokenizer.h +++ b/funasr/runtime/onnxruntime/src/tokenizer.h @@ -5,6 +5,9 @@ #pragma once #include +#include "cppjieba/DictTrie.hpp" +#include "cppjieba/HMMModel.hpp" +#include "cppjieba/Jieba.hpp" namespace funasr { class CTokenizer { @@ -14,6 +17,10 @@ private: vector m_id2token,m_id2punc; map m_token2id,m_punc2id; + cppjieba::DictTrie *jieba_dict_trie_; + cppjieba::HMMModel *jieba_model_; + cppjieba::Jieba jieba_processor_; + public: CTokenizer(const char* sz_yamlfile); @@ -28,9 +35,13 @@ public: string Id2Punc(int n_punc_id); vector Punc2Ids(vector input); vector SplitChineseString(const string& str_info); + vector SplitChineseJieba(const string& str_info); void StrSplit(const string& str, const char split, vector& res); void Tokenize(const char* str_info, vector& str_out, vector& id_out); bool IsPunc(string& Punc); + bool seg_jieba = false; + void SetJiebaRes(cppjieba::DictTrie *dict, cppjieba::HMMModel *hmm); + 
void JiebaInit(std::string punc_config); }; } // namespace funasr diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp new file mode 100644 index 000000000..c219a017e --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/DictTrie.hpp @@ -0,0 +1,274 @@ +#ifndef CPPJIEBA_DICT_TRIE_HPP +#define CPPJIEBA_DICT_TRIE_HPP + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "limonp/StringUtil.hpp" +#include "limonp/Logging.hpp" +#include "Unicode.hpp" +#include "Trie.hpp" + +namespace cppjieba { + +using namespace limonp; + +const double MIN_DOUBLE = -3.14e+100; +const double MAX_DOUBLE = 3.14e+100; +const size_t DICT_COLUMN_NUM = 3; +const char* const UNKNOWN_TAG = ""; + +class DictTrie { + public: + enum UserWordWeightOption { + WordWeightMin, + WordWeightMedian, + WordWeightMax, + }; // enum UserWordWeightOption + + DictTrie(const string& dict_path, const string& user_dict_paths = "", UserWordWeightOption user_word_weight_opt = WordWeightMedian) { + Init(dict_path, user_dict_paths, user_word_weight_opt); + } + ~DictTrie() { + delete trie_; + } + + bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + DictUnit node_info; + if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { + return false; + } + active_node_infos_.push_back(node_info); + trie_->InsertNode(node_info.word, &active_node_infos_.back()); + return true; + } + + bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + DictUnit node_info; + double weight = freq ? 
log(1.0 * freq / freq_sum_) : user_word_default_weight_ ; + if (!MakeNodeInfo(node_info, word, weight , tag)) { + return false; + } + active_node_infos_.push_back(node_info); + trie_->InsertNode(node_info.word, &active_node_infos_.back()); + return true; + } + + bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + DictUnit node_info; + if (!MakeNodeInfo(node_info, word, user_word_default_weight_, tag)) { + return false; + } + trie_->DeleteNode(node_info.word, &node_info); + return true; + } + + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + return trie_->Find(begin, end); + } + + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector&res, + size_t max_word_len = MAX_WORD_LENGTH) const { + trie_->Find(begin, end, res, max_word_len); + } + + bool Find(const string& word) + { + const DictUnit *tmp = NULL; + RuneStrArray runes; + if (!DecodeRunesInString(word, runes)) + { + XLOG(ERROR) << "Decode failed."; + } + tmp = Find(runes.begin(), runes.end()); + if (tmp == NULL) + { + return false; + } + else + { + return true; + } + } + + bool IsUserDictSingleChineseWord(const Rune& word) const { + return IsIn(user_dict_single_chinese_word_, word); + } + + double GetMinWeight() const { + return min_weight_; + } + + void InserUserDictNode(const string& line) { + vector buf; + DictUnit node_info; + Split(line, buf, " "); + if(buf.size() == 1){ + MakeNodeInfo(node_info, + buf[0], + user_word_default_weight_, + UNKNOWN_TAG); + } else if (buf.size() == 2) { + MakeNodeInfo(node_info, + buf[0], + user_word_default_weight_, + buf[1]); + } else if (buf.size() == 3) { + int freq = atoi(buf[1].c_str()); + assert(freq_sum_ > 0.0); + double weight = log(1.0 * freq / freq_sum_); + MakeNodeInfo(node_info, buf[0], weight, buf[2]); + } + static_node_infos_.push_back(node_info); + if (node_info.word.size() == 1) { + user_dict_single_chinese_word_.insert(node_info.word[0]); + } + } + 
+ void LoadUserDict(const vector& buf) { + for (size_t i = 0; i < buf.size(); i++) { + InserUserDictNode(buf[i]); + } + } + + void LoadUserDict(const set& buf) { + std::set::const_iterator iter; + for (iter = buf.begin(); iter != buf.end(); iter++){ + InserUserDictNode(*iter); + } + } + + void LoadUserDict(const string& filePaths) { + vector files = limonp::Split(filePaths, "|;"); + size_t lineno = 0; + for (size_t i = 0; i < files.size(); i++) { + ifstream ifs(files[i].c_str()); + XCHECK(ifs.is_open()) << "open " << files[i] << " failed"; + string line; + + for (; getline(ifs, line); lineno++) { + if (line.size() == 0) { + continue; + } + InserUserDictNode(line); + } + } + } + + + private: + void Init(const string& dict_path, const string& user_dict_paths, UserWordWeightOption user_word_weight_opt) { + LoadDict(dict_path); + Shrink(static_node_infos_); + CreateTrie(static_node_infos_); + } + + void CreateTrie(const vector& dictUnits) { + assert(dictUnits.size()); + vector words; + vector valuePointers; + for (size_t i = 0 ; i < dictUnits.size(); i ++) { + words.push_back(dictUnits[i].word); + valuePointers.push_back(&dictUnits[i]); + } + trie_ = new Trie(words, valuePointers); + } + + bool MakeNodeInfo(DictUnit& node_info, + const string& word, + double weight, + const string& tag) { + if (!DecodeRunesInString(word, node_info.word)) { + XLOG(ERROR) << "Decode " << word << " failed."; + return false; + } + node_info.weight = weight; + node_info.tag = tag; + return true; + } + + void LoadDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed."; + string line; + vector buf; + + DictUnit node_info; + for (size_t lineno = 0; getline(ifs, line); lineno++) { + Split(line, buf, " "); + XCHECK(buf.size() == DICT_COLUMN_NUM) << "split result illegal, line:" << line; + MakeNodeInfo(node_info, + buf[0], + atof(buf[1].c_str()), + buf[2]); + static_node_infos_.push_back(node_info); + } + } + + static bool 
WeightCompare(const DictUnit& lhs, const DictUnit& rhs) { + return lhs.weight < rhs.weight; + } + + void SetStaticWordWeights(UserWordWeightOption option) { + XCHECK(!static_node_infos_.empty()); + vector x = static_node_infos_; + sort(x.begin(), x.end(), WeightCompare); + min_weight_ = x[0].weight; + max_weight_ = x[x.size() - 1].weight; + median_weight_ = x[x.size() / 2].weight; + switch (option) { + case WordWeightMin: + user_word_default_weight_ = min_weight_; + break; + case WordWeightMedian: + user_word_default_weight_ = median_weight_; + break; + default: + user_word_default_weight_ = max_weight_; + break; + } + } + + double CalcFreqSum(const vector& node_infos) const { + double sum = 0.0; + for (size_t i = 0; i < node_infos.size(); i++) { + sum += node_infos[i].weight; + } + return sum; + } + + void CalculateWeight(vector& node_infos, double sum) const { + assert(sum > 0.0); + for (size_t i = 0; i < node_infos.size(); i++) { + DictUnit& node_info = node_infos[i]; + assert(node_info.weight > 0.0); + node_info.weight = log(double(node_info.weight)/sum); + } + } + + void Shrink(vector& units) const { + vector(units.begin(), units.end()).swap(units); + } + + vector static_node_infos_; + deque active_node_infos_; // must not be vector + Trie * trie_; + + double freq_sum_; + double min_weight_; + double max_weight_; + double median_weight_; + double user_word_default_weight_; + unordered_set user_dict_single_chinese_word_; +}; +} + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp new file mode 100644 index 000000000..2295ceb11 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/FullSegment.hpp @@ -0,0 +1,102 @@ +#ifndef CPPJIEBA_FULLSEGMENT_H +#define CPPJIEBA_FULLSEGMENT_H + +#include +#include +#include +#include "limonp/Logging.hpp" +#include "DictTrie.hpp" +#include "SegmentBase.hpp" +#include 
"Unicode.hpp" + +namespace cppjieba { +class FullSegment: public SegmentBase { + public: + FullSegment(const string& dictPath) { + dictTrie_ = new DictTrie(dictPath); + isNeedDestroy_ = true; + } + FullSegment(const DictTrie* dictTrie) + : dictTrie_(dictTrie), isNeedDestroy_(false) { + assert(dictTrie_); + } + FullSegment() { + dictTrie_ = NULL; + } + ~FullSegment() { + if (isNeedDestroy_) { + delete dictTrie_; + } + } + + void setRes(DictTrie *&dictTrie) { + dictTrie_ = dictTrie; + isNeedDestroy_ = false; + assert(dictTrie_); + } + void Cut(const string& sentence, + vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector wrs; + wrs.reserve(sentence.size()/2); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + Cut(range.begin, range.end, wrs); + } + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); + } + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& res) const { + // result of searching in trie tree + LocalVector > tRes; + + // max index of res's words + size_t maxIdx = 0; + + // always equals to (uItr - begin) + size_t uIdx = 0; + + // tmp variables + size_t wordLen = 0; + assert(dictTrie_); + vector dags; + dictTrie_->Find(begin, end, dags); + for (size_t i = 0; i < dags.size(); i++) { + for (size_t j = 0; j < dags[i].nexts.size(); j++) { + size_t nextoffset = dags[i].nexts[j].first; + assert(nextoffset < dags.size()); + const DictUnit* du = dags[i].nexts[j].second; + if (du == NULL) { + if (dags[i].nexts.size() == 1 && maxIdx <= uIdx) { + WordRange wr(begin + i, begin + nextoffset); + res.push_back(wr); + } + } else { + wordLen = du->word.size(); + if (wordLen >= 2 || (dags[i].nexts.size() == 1 && maxIdx <= uIdx)) { + WordRange wr(begin + i, begin + nextoffset); + res.push_back(wr); + } 
+ } + maxIdx = uIdx + wordLen > maxIdx ? uIdx + wordLen : maxIdx; + } + uIdx++; + } + } + private: + const DictTrie* dictTrie_; + bool isNeedDestroy_; +}; +} + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp new file mode 100644 index 000000000..27e6b6622 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMModel.hpp @@ -0,0 +1,129 @@ +#ifndef CPPJIEBA_HMMMODEL_H +#define CPPJIEBA_HMMMODEL_H + +#include "limonp/StringUtil.hpp" +#include "Trie.hpp" + +namespace cppjieba { + +using namespace limonp; +typedef unordered_map EmitProbMap; + +struct HMMModel { + /* + * STATUS: + * 0: HMMModel::B, 1: HMMModel::E, 2: HMMModel::M, 3:HMMModel::S + * */ + enum {B = 0, E = 1, M = 2, S = 3, STATUS_SUM = 4}; + + HMMModel(const string& modelPath) { + memset(startProb, 0, sizeof(startProb)); + memset(transProb, 0, sizeof(transProb)); + statMap[0] = 'B'; + statMap[1] = 'E'; + statMap[2] = 'M'; + statMap[3] = 'S'; + emitProbVec.push_back(&emitProbB); + emitProbVec.push_back(&emitProbE); + emitProbVec.push_back(&emitProbM); + emitProbVec.push_back(&emitProbS); + LoadModel(modelPath); + } + ~HMMModel() { + } + void LoadModel(const string& filePath) { + ifstream ifile(filePath.c_str()); + XCHECK(ifile.is_open()) << "open " << filePath << " failed"; + string line; + vector tmp; + vector tmp2; + //Load startProb + XCHECK(GetLine(ifile, line)); + Split(line, tmp, " "); + XCHECK(tmp.size() == STATUS_SUM); + for (size_t j = 0; j< tmp.size(); j++) { + startProb[j] = atof(tmp[j].c_str()); + } + + //Load transProb + for (size_t i = 0; i < STATUS_SUM; i++) { + XCHECK(GetLine(ifile, line)); + Split(line, tmp, " "); + XCHECK(tmp.size() == STATUS_SUM); + for (size_t j =0; j < STATUS_SUM; j++) { + transProb[i][j] = atof(tmp[j].c_str()); + } + } + + //Load emitProbB + XCHECK(GetLine(ifile, line)); + XCHECK(LoadEmitProb(line, emitProbB)); + + 
//Load emitProbE + XCHECK(GetLine(ifile, line)); + XCHECK(LoadEmitProb(line, emitProbE)); + + //Load emitProbM + XCHECK(GetLine(ifile, line)); + XCHECK(LoadEmitProb(line, emitProbM)); + + //Load emitProbS + XCHECK(GetLine(ifile, line)); + XCHECK(LoadEmitProb(line, emitProbS)); + } + double GetEmitProb(const EmitProbMap* ptMp, Rune key, + double defVal)const { + EmitProbMap::const_iterator cit = ptMp->find(key); + if (cit == ptMp->end()) { + return defVal; + } + return cit->second; + } + bool GetLine(ifstream& ifile, string& line) { + while (getline(ifile, line)) { + Trim(line); + if (line.empty()) { + continue; + } + if (StartsWith(line, "#")) { + continue; + } + return true; + } + return false; + } + bool LoadEmitProb(const string& line, EmitProbMap& mp) { + if (line.empty()) { + return false; + } + vector tmp, tmp2; + Unicode unicode; + Split(line, tmp, ","); + for (size_t i = 0; i < tmp.size(); i++) { + Split(tmp[i], tmp2, ":"); + if (2 != tmp2.size()) { + XLOG(ERROR) << "emitProb illegal."; + return false; + } + if (!DecodeRunesInString(tmp2[0], unicode) || unicode.size() != 1) { + XLOG(ERROR) << "TransCode failed."; + return false; + } + mp[unicode[0]] = atof(tmp2[1].c_str()); + } + return true; + } + + char statMap[STATUS_SUM]; + double startProb[STATUS_SUM]; + double transProb[STATUS_SUM][STATUS_SUM]; + EmitProbMap emitProbB; + EmitProbMap emitProbE; + EmitProbMap emitProbM; + EmitProbMap emitProbS; + vector emitProbVec; +}; // struct HMMModel + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp new file mode 100644 index 000000000..91a311dca --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/HMMSegment.hpp @@ -0,0 +1,197 @@ +#ifndef CPPJIBEA_HMMSEGMENT_H +#define CPPJIBEA_HMMSEGMENT_H + +#include +#include +#include +#include +#include "HMMModel.hpp" +#include 
"SegmentBase.hpp" + +namespace cppjieba { +class HMMSegment: public SegmentBase { + public: + HMMSegment(const string& filePath) + : model_(new HMMModel(filePath)), isNeedDestroy_(true) { + } + HMMSegment(const HMMModel* model) + : model_(model), isNeedDestroy_(false) { + } + HMMSegment() { + model_ = NULL; + } + + ~HMMSegment() { + if (isNeedDestroy_) { + delete model_; + } + } + void setRes(HMMModel *&model) { + model_ = model; + isNeedDestroy_ = false; + } + void Cut(const string& sentence, + vector& words) const { + vector tmp; + Cut(sentence, tmp); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector wrs; + wrs.reserve(sentence.size()/2); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + Cut(range.begin, range.end, wrs); + } + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); + } + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right = begin; + while (right != end) { + if (right->rune < 0x80) { + if (left != right) { + InternalCut(left, right, res); + } + left = right; + do { + right = SequentialLetterRule(left, end); + if (right != left) { + break; + } + right = NumbersRule(left, end); + if (right != left) { + break; + } + right ++; + } while (false); + WordRange wr(left, right - 1); + res.push_back(wr); + left = right; + } else { + right++; + } + } + if (left != right) { + InternalCut(left, right, res); + } + } + private: + // sequential letters rule + RuneStrArray::const_iterator SequentialLetterRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + Rune x = begin->rune; + if (('a' <= x && x <= 'z') || ('A' <= x && x <= 'Z')) { + begin ++; + } else { + return begin; + } + while (begin != end) { + x = begin->rune; + if (('a' 
<= x && x <= 'z') || ('A' <= x && x <= 'Z') || ('0' <= x && x <= '9')) { + begin ++; + } else { + break; + } + } + return begin; + } + // + RuneStrArray::const_iterator NumbersRule(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + Rune x = begin->rune; + if ('0' <= x && x <= '9') { + begin ++; + } else { + return begin; + } + while (begin != end) { + x = begin->rune; + if ( ('0' <= x && x <= '9') || x == '.') { + begin++; + } else { + break; + } + } + return begin; + } + void InternalCut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res) const { + vector status; + Viterbi(begin, end, status); + + RuneStrArray::const_iterator left = begin; + RuneStrArray::const_iterator right; + for (size_t i = 0; i < status.size(); i++) { + if (status[i] % 2) { //if (HMMModel::E == status[i] || HMMModel::S == status[i]) + right = begin + i + 1; + WordRange wr(left, right - 1); + res.push_back(wr); + left = right; + } + } + } + + void Viterbi(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& status) const { + size_t Y = HMMModel::STATUS_SUM; + size_t X = end - begin; + + size_t XYSize = X * Y; + size_t now, old, stat; + double tmp, endE, endS; + + vector path(XYSize); + vector weight(XYSize); + + //start + for (size_t y = 0; y < Y; y++) { + weight[0 + y * X] = model_->startProb[y] + model_->GetEmitProb(model_->emitProbVec[y], begin->rune, MIN_DOUBLE); + path[0 + y * X] = -1; + } + + double emitProb; + + for (size_t x = 1; x < X; x++) { + for (size_t y = 0; y < Y; y++) { + now = x + y*X; + weight[now] = MIN_DOUBLE; + path[now] = HMMModel::E; // warning + emitProb = model_->GetEmitProb(model_->emitProbVec[y], (begin+x)->rune, MIN_DOUBLE); + for (size_t preY = 0; preY < Y; preY++) { + old = x - 1 + preY * X; + tmp = weight[old] + model_->transProb[preY][y] + emitProb; + if (tmp > weight[now]) { + weight[now] = tmp; + path[now] = preY; + } + } + } + } + + endE = weight[X-1+HMMModel::E*X]; + 
endS = weight[X-1+HMMModel::S*X]; + stat = 0; + if (endE >= endS) { + stat = HMMModel::E; + } else { + stat = HMMModel::S; + } + + status.resize(X); + for (int x = X -1 ; x >= 0; x--) { + status[x] = stat; + stat = path[x + stat*X]; + } + } + + const HMMModel* model_; + bool isNeedDestroy_; +}; // class HMMSegment + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp new file mode 100644 index 000000000..0e778f993 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Jieba.hpp @@ -0,0 +1,141 @@ +#ifndef CPPJIEAB_JIEBA_H +#define CPPJIEAB_JIEBA_H + +#include "QuerySegment.hpp" +#include "KeywordExtractor.hpp" + +namespace cppjieba { + +class Jieba { + public: + Jieba(DictTrie *jieba_dict_trie, + HMMModel *jieba_model) + : dict_trie_(jieba_dict_trie), + model_(jieba_model), + mp_seg_(dict_trie_), + hmm_seg_(model_), + mix_seg_(dict_trie_, model_), + full_seg_(dict_trie_), + query_seg_(dict_trie_, model_) { + } + Jieba() { + dict_trie_ = NULL; + model_ = NULL; + } + ~Jieba() { + } + + struct LocWord { + string word; + size_t begin; + size_t end; + }; // struct LocWord + void SetJiebaRes(cppjieba::DictTrie *&dict, cppjieba::HMMModel *&hmm) { + dict_trie_ = dict; + model_ = hmm; + mp_seg_.setRes(dict); + hmm_seg_.setRes(hmm); + mix_seg_.setRes(dict, hmm); + full_seg_.setRes(dict); + query_seg_.setRes(dict, hmm); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + mix_seg_.Cut(sentence, words, hmm); + } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } + void CutAll(const string& sentence, vector& words) const { + full_seg_.Cut(sentence, words); + } + void CutForSearch(const string& sentence, vector& words, bool 
hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } + void CutForSearch(const string& sentence, vector& words, bool hmm = true) const { + query_seg_.Cut(sentence, words, hmm); + } + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } + void CutHMM(const string& sentence, vector& words) const { + hmm_seg_.Cut(sentence, words); + } + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } + void CutSmall(const string& sentence, vector& words, size_t max_word_len) const { + mp_seg_.Cut(sentence, words, max_word_len); + } + + void Tag(const string& sentence, vector >& words) const { + mix_seg_.Tag(sentence, words); + } + string LookupTag(const string &str) const { + return mix_seg_.LookupTag(str); + } + bool InsertUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + return dict_trie_->InsertUserWord(word, tag); + } + + bool InsertUserWord(const string& word,int freq, const string& tag = UNKNOWN_TAG) { + return dict_trie_->InsertUserWord(word,freq, tag); + } + + bool DeleteUserWord(const string& word, const string& tag = UNKNOWN_TAG) { + return dict_trie_->DeleteUserWord(word, tag); + } + + bool Find(const string& word) + { + return dict_trie_->Find(word); + } + + void ResetSeparators(const string& s) { + //TODO + mp_seg_.ResetSeparators(s); + hmm_seg_.ResetSeparators(s); + mix_seg_.ResetSeparators(s); + full_seg_.ResetSeparators(s); + query_seg_.ResetSeparators(s); + } + + const DictTrie* GetDictTrie() const { + return dict_trie_; + } + + const HMMModel* GetHMMModel() const { + return model_; + } + + void LoadUserDict(const vector& buf) { + dict_trie_->LoadUserDict(buf); + } + + void LoadUserDict(const set& buf) { + dict_trie_->LoadUserDict(buf); + } + + void LoadUserDict(const string& path) { + dict_trie_->LoadUserDict(path); + } + + private: + DictTrie *dict_trie_; + HMMModel *model_; + + // They share the same dict trie and 
model + MPSegment mp_seg_; + HMMSegment hmm_seg_; + MixSegment mix_seg_; + FullSegment full_seg_; + QuerySegment query_seg_; + + public: +}; // class Jieba + +} // namespace cppjieba + +#endif // CPPJIEAB_JIEBA_H diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp new file mode 100644 index 000000000..15b50b932 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/KeywordExtractor.hpp @@ -0,0 +1,154 @@ +#ifndef CPPJIEBA_KEYWORD_EXTRACTOR_H +#define CPPJIEBA_KEYWORD_EXTRACTOR_H + +#include +#include +#include "MixSegment.hpp" + +namespace cppjieba { + +using namespace limonp; +using namespace std; + +/*utf8*/ +class KeywordExtractor { + public: + struct Word { + string word; + vector offsets; + double weight; + }; // struct Word + + KeywordExtractor(const string& dictPath, + const string& hmmFilePath, + const string& idfPath, + const string& stopWordPath, + const string& userDict = "") + : segment_(dictPath, hmmFilePath, userDict) { + LoadIdfDict(idfPath); + LoadStopWordDict(stopWordPath); + } + KeywordExtractor(const DictTrie* dictTrie, + const HMMModel* model, + const string& idfPath, + const string& stopWordPath) + : segment_(dictTrie, model) { + LoadIdfDict(idfPath); + LoadStopWordDict(stopWordPath); + } + KeywordExtractor() {} + ~KeywordExtractor() { + } + + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); + } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); + } + } + + void Extract(const string& sentence, vector& keywords, size_t 
topN) const { + vector words; + segment_.Cut(sentence, words); + + map wordmap; + size_t offset = 0; + for (size_t i = 0; i < words.size(); ++i) { + size_t t = offset; + offset += words[i].size(); + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + continue; + } + wordmap[words[i]].offsets.push_back(t); + wordmap[words[i]].weight += 1.0; + } + if (offset != sentence.size()) { + XLOG(ERROR) << "words illegal"; + return; + } + + keywords.clear(); + keywords.reserve(wordmap.size()); + for (map::iterator itr = wordmap.begin(); itr != wordmap.end(); ++itr) { + unordered_map::const_iterator cit = idfMap_.find(itr->first); + if (cit != idfMap_.end()) { + itr->second.weight *= cit->second; + } else { + itr->second.weight *= idfAverage_; + } + itr->second.word = itr->first; + keywords.push_back(itr->second); + } + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + keywords.resize(topN); + } + private: + void LoadIdfDict(const string& idfPath) { + ifstream ifs(idfPath.c_str()); + XCHECK(ifs.is_open()) << "open " << idfPath << " failed"; + string line ; + vector buf; + double idf = 0.0; + double idfSum = 0.0; + size_t lineno = 0; + for (; getline(ifs, line); lineno++) { + buf.clear(); + if (line.empty()) { + XLOG(ERROR) << "lineno: " << lineno << " empty. skipped."; + continue; + } + Split(line, buf, " "); + if (buf.size() != 2) { + XLOG(ERROR) << "line: " << line << ", lineno: " << lineno << " empty. 
skipped."; + continue; + } + idf = atof(buf[1].c_str()); + idfMap_[buf[0]] = idf; + idfSum += idf; + + } + + assert(lineno); + idfAverage_ = idfSum / lineno; /* NOTE(review): lineno is also advanced for skipped lines, so they count toward this mean */ + assert(idfAverage_ > 0.0); + } + void LoadStopWordDict(const string& filePath) { /* stores every line of filePath as one entry of stopWords_ */ + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + while (getline(ifs, line)) { + stopWords_.insert(line); + } + assert(stopWords_.size()); + } + + static bool Compare(const Word& lhs, const Word& rhs) { /* true when lhs outweighs rhs, i.e. descending-weight order */ + return lhs.weight > rhs.weight; + } + + MixSegment segment_; + unordered_map idfMap_; + double idfAverage_; + + unordered_set stopWords_; +}; // class KeywordExtractor + +inline ostream& operator << (ostream& os, const KeywordExtractor::Word& word) { + return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; +} + +} // namespace cppjieba + +#endif + + diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp new file mode 100644 index 000000000..524dcb0c0 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MPSegment.hpp @@ -0,0 +1,144 @@ +#ifndef CPPJIEBA_MPSEGMENT_H +#define CPPJIEBA_MPSEGMENT_H + +#include +#include +#include +#include "limonp/Logging.hpp" +#include "DictTrie.hpp" +#include "SegmentTagged.hpp" +#include "PosTagger.hpp" + +namespace cppjieba { + +class MPSegment: public SegmentTagged { + public: + MPSegment(const string& dictPath, const string& userDictPath = "") /* builds its own DictTrie and deletes it in the destructor */ + : dictTrie_(new DictTrie(dictPath, userDictPath)), isNeedDestroy_(true) { + } + MPSegment(const DictTrie* dictTrie) /* borrows dictTrie; this object never deletes it */ + : dictTrie_(dictTrie), isNeedDestroy_(false) { + assert(dictTrie_); + } + MPSegment() /* empty segment: a trie has to be attached with setRes() before cutting */ + : dictTrie_(NULL), isNeedDestroy_(false) { /* flag is initialised too, so ~MPSegment() never tests an indeterminate bool */ + } + ~MPSegment() { /* releases dictTrie_ only when this object allocated it */ + if (isNeedDestroy_) { + delete dictTrie_; + } + } + void setRes(DictTrie *&dictTrie) { + dictTrie_ = dictTrie; + isNeedDestroy_ = false; +
assert(dictTrie_); + } + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, MAX_WORD_LENGTH); + } + + void Cut(const string& sentence, + vector& words, + size_t max_word_len) const { + vector tmp; + Cut(sentence, tmp, max_word_len); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector wrs; + wrs.reserve(sentence.size()/2); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + Cut(range.begin, range.end, wrs, max_word_len); + } + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); + } + void Cut(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector& words, + size_t max_word_len = MAX_WORD_LENGTH) const { + vector dags; + dictTrie_->Find(begin, + end, + dags, + max_word_len); + CalcDP(dags); + CutByDag(begin, end, dags, words); + } + + const DictTrie* GetDictTrie() const { + return dictTrie_; + } + + bool Tag(const string& src, vector >& res) const { + return tagger_.Tag(src, res, *this); + } + + bool IsUserDictSingleChineseWord(const Rune& value) const { + return dictTrie_->IsUserDictSingleChineseWord(value); + } + private: + void CalcDP(vector& dags) const { + size_t nextPos; + const DictUnit* p; + double val; + + for (vector::reverse_iterator rit = dags.rbegin(); rit != dags.rend(); rit++) { + rit->pInfo = NULL; + rit->weight = MIN_DOUBLE; + assert(!rit->nexts.empty()); + for (LocalVector >::const_iterator it = rit->nexts.begin(); it != rit->nexts.end(); it++) { + nextPos = it->first; + p = it->second; + val = 0.0; + if (nextPos + 1 < dags.size()) { + val += dags[nextPos + 1].weight; + } + + if (p) { + val += p->weight; + } else { + val += dictTrie_->GetMinWeight(); + } + if (val > rit->weight) { + rit->pInfo = p; + rit->weight = val; + } + } + } + } + void CutByDag(RuneStrArray::const_iterator 
begin, + RuneStrArray::const_iterator end, + const vector& dags, + vector& words) const { + size_t i = 0; + while (i < dags.size()) { + const DictUnit* p = dags[i].pInfo; + if (p) { + assert(p->word.size() >= 1); + WordRange wr(begin + i, begin + i + p->word.size() - 1); + words.push_back(wr); + i += p->word.size(); + } else { //single chinese word + WordRange wr(begin + i, begin + i); + words.push_back(wr); + i++; + } + } + } + + const DictTrie* dictTrie_; + bool isNeedDestroy_; + PosTagger tagger_; + +}; // class MPSegment + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp new file mode 100644 index 000000000..c0582198a --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/MixSegment.hpp @@ -0,0 +1,113 @@ +#ifndef CPPJIEBA_MIXSEGMENT_H +#define CPPJIEBA_MIXSEGMENT_H + +#include +#include "MPSegment.hpp" +#include "HMMSegment.hpp" +#include "limonp/StringUtil.hpp" +#include "PosTagger.hpp" + +namespace cppjieba { +class MixSegment: public SegmentTagged { + public: + MixSegment(const string& mpSegDict, const string& hmmSegDict, + const string& userDict = "") + : mpSeg_(mpSegDict, userDict), + hmmSeg_(hmmSegDict) { + } + MixSegment(const DictTrie* dictTrie, const HMMModel* model) + : mpSeg_(dictTrie), hmmSeg_(model) { + } + MixSegment() {} + ~MixSegment() { + } + void setRes(DictTrie *&dictTrie, HMMModel *&model) { + mpSeg_.setRes(dictTrie); + hmmSeg_.setRes(model); + } + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, true); + } + void Cut(const string& sentence, vector& words, bool hmm) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector wrs; + 
wrs.reserve(sentence.size() / 2); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + Cut(range.begin, range.end, wrs, hmm); + } + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); + } + + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { + if (!hmm) { + mpSeg_.Cut(begin, end, res); + return; + } + vector words; + assert(end >= begin); + words.reserve(end - begin); + mpSeg_.Cut(begin, end, words); + + vector hmmRes; + hmmRes.reserve(end - begin); + for (size_t i = 0; i < words.size(); i++) { + //if mp Get a word, it's ok, put it into result + if (words[i].left != words[i].right || (words[i].left == words[i].right && mpSeg_.IsUserDictSingleChineseWord(words[i].left->rune))) { + res.push_back(words[i]); + continue; + } + + // if mp Get a single one and it is not in userdict, collect it in sequence + size_t j = i; + while (j < words.size() && words[j].left == words[j].right && !mpSeg_.IsUserDictSingleChineseWord(words[j].left->rune)) { + j++; + } + + // Cut the sequence with hmm + assert(j - 1 >= i); + // TODO + hmmSeg_.Cut(words[i].left, words[j - 1].left + 1, hmmRes); + //put hmm result to result + for (size_t k = 0; k < hmmRes.size(); k++) { + res.push_back(hmmRes[k]); + } + + //clear tmp vars + hmmRes.clear(); + + //let i jump over this piece + i = j - 1; + } + } + + const DictTrie* GetDictTrie() const { + return mpSeg_.GetDictTrie(); + } + + bool Tag(const string& src, vector >& res) const { + return tagger_.Tag(src, res, *this); + } + + string LookupTag(const string &str) const { + return tagger_.LookupTag(str, *this); + } + + private: + MPSegment mpSeg_; + HMMSegment hmmSeg_; + PosTagger tagger_; + +}; // class MixSegment + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp new file mode 100644 index 
000000000..78853d53c --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PosTagger.hpp @@ -0,0 +1,77 @@ +#ifndef CPPJIEBA_POS_TAGGING_H +#define CPPJIEBA_POS_TAGGING_H + +#include "limonp/StringUtil.hpp" +#include "SegmentTagged.hpp" +#include "DictTrie.hpp" + +namespace cppjieba { +using namespace limonp; + +static const char* const POS_M = "m"; /* returned by SpecialRule() when every counted ASCII rune is a digit */ +static const char* const POS_ENG = "eng"; /* returned by SpecialRule() when a counted ASCII rune is not a digit */ +static const char* const POS_X = "x"; /* returned when decoding fails or no ASCII rune was counted */ + +class PosTagger { /* Looks up part-of-speech tags for the words produced by a SegmentTagged. */ + public: + PosTagger() { + } + ~PosTagger() { + } + + bool Tag(const string& src, vector >& res, const SegmentTagged& segment) const { /* Cuts src with segment and appends one (word, tag) pair per word to res; res is not cleared first. Returns false only when res is still empty afterwards. */ + vector CutRes; + segment.Cut(src, CutRes); + + for (vector::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { + res.push_back(make_pair(*itr, LookupTag(*itr, segment))); + } + return !res.empty(); + } + + string LookupTag(const string &str, const SegmentTagged& segment) const { /* Tag stored for str in segment's DictTrie; POS_X when str cannot be decoded; SpecialRule() when the word is absent or its stored tag is empty. */ + const DictUnit *tmp = NULL; + RuneStrArray runes; + const DictTrie * dict = segment.GetDictTrie(); + assert(dict != NULL); + if (!DecodeRunesInString(str, runes)) { + XLOG(ERROR) << "Decode failed."; + return POS_X; + } + tmp = dict->Find(runes.begin(), runes.end()); + if (tmp == NULL || tmp->tag.empty()) { + return SpecialRule(runes); + } else { + return tmp->tag; + } + } + + private: + const char* SpecialRule(const RuneStrArray& unicode) const { /* Fallback tag from ASCII content: eng counts runes below 0x80, m counts the digits among them; the scan stops once eng reaches unicode.size() / 2. NOTE(review): for a single rune that bound is 0, so the loop never runs and POS_X is returned. */ + size_t m = 0; + size_t eng = 0; + for (size_t i = 0; i < unicode.size() && eng < unicode.size() / 2; i++) { + if (unicode[i].rune < 0x80) { + eng ++; + if ('0' <= unicode[i].rune && unicode[i].rune <= '9') { + m++; + } + } + } + // ascii char is not found + if (eng == 0) { + return POS_X; + } + // all the ascii is number char + if (m == eng) { + return POS_M; + } + // the ascii chars contain english letter + return POS_ENG; + } + +}; // class PosTagger + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp
b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp new file mode 100644 index 000000000..ecb81c0bb --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/PreFilter.hpp @@ -0,0 +1,54 @@ +#ifndef CPPJIEBA_PRE_FILTER_H +#define CPPJIEBA_PRE_FILTER_H + +#include "Trie.hpp" +#include "limonp/Logging.hpp" + +namespace cppjieba { + +class PreFilter { + public: + //TODO use WordRange instead of Range + struct Range { + RuneStrArray::const_iterator begin; + RuneStrArray::const_iterator end; + }; // struct Range + + PreFilter(const unordered_set& symbols, + const string& sentence) + : symbols_(symbols) { + if (!DecodeRunesInString(sentence, sentence_)) { + XLOG(ERROR) << "decode failed. "; + } + cursor_ = sentence_.begin(); + } + ~PreFilter() { + } + bool HasNext() const { + return cursor_ != sentence_.end(); + } + Range Next() { + Range range; + range.begin = cursor_; + while (cursor_ != sentence_.end()) { + if (IsIn(symbols_, cursor_->rune)) { + if (range.begin == cursor_) { + cursor_ ++; + } + range.end = cursor_; + return range; + } + cursor_ ++; + } + range.end = sentence_.end(); + return range; + } + private: + RuneStrArray::const_iterator cursor_; + RuneStrArray sentence_; + const unordered_set& symbols_; +}; // class PreFilter + +} // namespace cppjieba + +#endif // CPPJIEBA_PRE_FILTER_H diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp new file mode 100644 index 000000000..8ba7a9ff2 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/QuerySegment.hpp @@ -0,0 +1,95 @@ +#ifndef CPPJIEBA_QUERYSEGMENT_H +#define CPPJIEBA_QUERYSEGMENT_H + +#include +#include +#include +#include "limonp/Logging.hpp" +#include "DictTrie.hpp" +#include "SegmentBase.hpp" +#include "FullSegment.hpp" +#include "MixSegment.hpp" +#include "Unicode.hpp" + +namespace cppjieba { +class 
QuerySegment: public SegmentBase { + public: + QuerySegment(const string& dict, const string& model, const string& userDict = "") + : mixSeg_(dict, model, userDict), + trie_(mixSeg_.GetDictTrie()) { + } + QuerySegment(const DictTrie* dictTrie, const HMMModel* model) + : mixSeg_(dictTrie, model), trie_(dictTrie) { + } + QuerySegment() { + trie_ = NULL; + } + ~QuerySegment() { + } + void setRes(DictTrie *&dictTrie, HMMModel *&model) { + mixSeg_.setRes(dictTrie, model); + trie_ = dictTrie; + } + void Cut(const string& sentence, vector& words) const { + Cut(sentence, words, true); + } + void Cut(const string& sentence, vector& words, bool hmm) const { + vector tmp; + Cut(sentence, tmp, hmm); + GetStringsFromWords(tmp, words); + } + void Cut(const string& sentence, vector& words, bool hmm = true) const { + PreFilter pre_filter(symbols_, sentence); + PreFilter::Range range; + vector wrs; + wrs.reserve(sentence.size()/2); + while (pre_filter.HasNext()) { + range = pre_filter.Next(); + Cut(range.begin, range.end, wrs, hmm); + } + words.clear(); + words.reserve(wrs.size()); + GetWordsFromWordRanges(sentence, wrs, words); + } + void Cut(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end, vector& res, bool hmm) const { + //use mix Cut first + vector mixRes; + mixSeg_.Cut(begin, end, mixRes, hmm); + + vector fullRes; + for (vector::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); mixResItr++) { + if (mixResItr->Length() > 2) { + for (size_t i = 0; i + 1 < mixResItr->Length(); i++) { + WordRange wr(mixResItr->left + i, mixResItr->left + i + 1); + if (trie_->Find(wr.left, wr.right + 1) != NULL) { + res.push_back(wr); + } + } + } + if (mixResItr->Length() > 3) { + for (size_t i = 0; i + 2 < mixResItr->Length(); i++) { + WordRange wr(mixResItr->left + i, mixResItr->left + i + 2); + if (trie_->Find(wr.left, wr.right + 1) != NULL) { + res.push_back(wr); + } + } + } + res.push_back(*mixResItr); + } + } + private: + bool IsAllAscii(const Unicode& 
s) const { + for(size_t i = 0; i < s.size(); i++) { + if (s[i] >= 0x80) { + return false; + } + } + return true; + } + MixSegment mixSeg_; + const DictTrie* trie_; +}; // QuerySegment + +} // namespace cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp new file mode 100644 index 000000000..79c80094e --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentBase.hpp @@ -0,0 +1,46 @@ +#ifndef CPPJIEBA_SEGMENTBASE_H +#define CPPJIEBA_SEGMENTBASE_H + +#include "limonp/Logging.hpp" +#include "PreFilter.hpp" +#include + + +namespace cppjieba { + +const char* const SPECIAL_SEPARATORS = " \t\n\xEF\xBC\x8C\xE3\x80\x82"; + +using namespace limonp; + +class SegmentBase { + public: + SegmentBase() { + XCHECK(ResetSeparators(SPECIAL_SEPARATORS)); + } + virtual ~SegmentBase() { + } + + virtual void Cut(const string& sentence, vector& words) const = 0; + + bool ResetSeparators(const string& s) { + symbols_.clear(); + RuneStrArray runes; + if (!DecodeRunesInString(s, runes)) { + XLOG(ERROR) << "decode " << s << " failed"; + return false; + } + for (size_t i = 0; i < runes.size(); i++) { + if (!symbols_.insert(runes[i].rune).second) { + XLOG(ERROR) << s.substr(runes[i].offset, runes[i].len) << " already exists"; + return false; + } + } + return true; + } + protected: + unordered_set symbols_; +}; // class SegmentBase + +} // cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp new file mode 100644 index 000000000..4d99a31af --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/SegmentTagged.hpp @@ -0,0 +1,23 @@ +#ifndef CPPJIEBA_SEGMENTTAGGED_H +#define CPPJIEBA_SEGMENTTAGGED_H + +#include "SegmentBase.hpp" + +namespace cppjieba { + +class 
SegmentTagged : public SegmentBase{ + public: + SegmentTagged() { + } + virtual ~SegmentTagged() { + } + + virtual bool Tag(const string& src, vector >& res) const = 0; + + virtual const DictTrie* GetDictTrie() const = 0; + +}; // class SegmentTagged + +} // cppjieba + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp new file mode 100644 index 000000000..292d0a8f9 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/TextRankExtractor.hpp @@ -0,0 +1,190 @@ +#ifndef CPPJIEBA_TEXTRANK_EXTRACTOR_H +#define CPPJIEBA_TEXTRANK_EXTRACTOR_H + +#include +#include "Jieba.hpp" + +namespace cppjieba { + using namespace limonp; + using namespace std; + + class TextRankExtractor { + public: + typedef struct _Word {string word;vector offsets;double weight;} Word; // struct Word + private: + typedef std::map WordMap; + + class WordGraph{ + private: + typedef double Score; + typedef string Node; + typedef std::set NodeSet; + + typedef std::map Edges; + typedef std::map Graph; + //typedef std::unordered_map Edges; + //typedef std::unordered_map Graph; + + double d; + Graph graph; + NodeSet nodeSet; + public: + WordGraph(): d(0.85) {}; + WordGraph(double in_d): d(in_d) {}; + + void addEdge(Node start,Node end,double weight){ + Edges temp; + Edges::iterator gotEdges; + nodeSet.insert(start); + nodeSet.insert(end); + graph[start][end]+=weight; + graph[end][start]+=weight; + } + + void rank(WordMap &ws,size_t rankTime=10){ + WordMap outSum; + Score wsdef, min_rank, max_rank; + + if( graph.size() == 0) + return; + + wsdef = 1.0 / graph.size(); + + for(Graph::iterator edges=graph.begin();edges!=graph.end();++edges){ + // edges->first start节点;edge->first end节点;edge->second 权重 + ws[edges->first].word=edges->first; + ws[edges->first].weight=wsdef; + outSum[edges->first].weight=0; + for(Edges::iterator 
edge=edges->second.begin();edge!=edges->second.end();++edge){ + outSum[edges->first].weight+=edge->second; + } + } + //sort(nodeSet.begin(),nodeSet.end()); 是否需要排序? + for( size_t i=0; ifirst end节点;edge->second 权重 + s += edge->second / outSum[edge->first].weight * ws[edge->first].weight; + ws[*node].weight = (1 - d) + d * s; + } + } + + min_rank=max_rank=ws.begin()->second.weight; + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + if( i->second.weight < min_rank ){ + min_rank = i->second.weight; + } + if( i->second.weight > max_rank ){ + max_rank = i->second.weight; + } + } + for(WordMap::iterator i = ws.begin(); i != ws.end(); i ++){ + ws[i->first].weight = (i->second.weight - min_rank / 10.0) / (max_rank - min_rank / 10.0); + } + } + }; + + public: + TextRankExtractor(const string& dictPath, + const string& hmmFilePath, + const string& stopWordPath, + const string& userDict = "") + : segment_(dictPath, hmmFilePath, userDict) { + LoadStopWordDict(stopWordPath); + } + TextRankExtractor(const DictTrie* dictTrie, + const HMMModel* model, + const string& stopWordPath) + : segment_(dictTrie, model) { + LoadStopWordDict(stopWordPath); + } + TextRankExtractor(const Jieba& jieba, const string& stopWordPath) : segment_(jieba.GetDictTrie(), jieba.GetHMMModel()) { + LoadStopWordDict(stopWordPath); + } + ~TextRankExtractor() { + } + + void Extract(const string& sentence, vector& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(topWords[i].word); + } + } + + void Extract(const string& sentence, vector >& keywords, size_t topN) const { + vector topWords; + Extract(sentence, topWords, topN); + for (size_t i = 0; i < topWords.size(); i++) { + keywords.push_back(pair(topWords[i].word, topWords[i].weight)); + } + } + + void Extract(const string& sentence, vector& keywords, size_t topN, size_t span=5,size_t rankTime=10) const { + vector words; + 
segment_.Cut(sentence, words); + + TextRankExtractor::WordGraph graph; + WordMap wordmap; + size_t offset = 0; + + for(size_t i=0; i < words.size(); i++){ + size_t t = offset; + offset += words[i].size(); + if (IsSingleWord(words[i]) || stopWords_.find(words[i]) != stopWords_.end()) { + continue; + } + for(size_t j=i+1,skip=0;jsecond); + } + + topN = min(topN, keywords.size()); + partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), Compare); + keywords.resize(topN); + } + private: + void LoadStopWordDict(const string& filePath) { + ifstream ifs(filePath.c_str()); + XCHECK(ifs.is_open()) << "open " << filePath << " failed"; + string line ; + while (getline(ifs, line)) { + stopWords_.insert(line); + } + assert(stopWords_.size()); + } + + static bool Compare(const Word &x,const Word &y){ + return x.weight > y.weight; + } + + MixSegment segment_; + unordered_set stopWords_; + }; // class TextRankExtractor + + inline ostream& operator << (ostream& os, const TextRankExtractor::Word& word) { + return os << "{\"word\": \"" << word.word << "\", \"offset\": " << word.offsets << ", \"weight\": " << word.weight << "}"; + } +} // namespace cppjieba + +#endif + + diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp new file mode 100644 index 000000000..e6f71b1c9 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Trie.hpp @@ -0,0 +1,200 @@ +#ifndef CPPJIEBA_TRIE_HPP +#define CPPJIEBA_TRIE_HPP + +#include +#include +#include "limonp/StdExtension.hpp" +#include "Unicode.hpp" + +namespace cppjieba { + +using namespace std; + +const size_t MAX_WORD_LENGTH = 512; + +struct DictUnit { + Unicode word; + double weight; + string tag; +}; // struct DictUnit + +// for debugging +// inline ostream & operator << (ostream& os, const DictUnit& unit) { +// string s; +// s << unit.word; +// return os << StringFormat("%s %s %.3lf", s.c_str(), 
unit.tag.c_str(), unit.weight); +// } + +struct Dag { + RuneStr runestr; + // [offset, nexts.first] + limonp::LocalVector > nexts; + const DictUnit * pInfo; + double weight; + size_t nextPos; // TODO + Dag():runestr(), pInfo(NULL), weight(0.0), nextPos(0) { + } +}; // struct Dag + +typedef Rune TrieKey; + +class TrieNode { + public : + TrieNode(): next(NULL), ptValue(NULL) { + } + public: + typedef unordered_map NextMap; + NextMap *next; + const DictUnit *ptValue; +}; + +class Trie { + public: + Trie(const vector& keys, const vector& valuePointers) + : root_(new TrieNode) { + CreateTrie(keys, valuePointers); + } + ~Trie() { + DeleteNode(root_); + } + + const DictUnit* Find(RuneStrArray::const_iterator begin, RuneStrArray::const_iterator end) const { + if (begin == end) { + return NULL; + } + + const TrieNode* ptNode = root_; + TrieNode::NextMap::const_iterator citer; + for (RuneStrArray::const_iterator it = begin; it != end; it++) { + if (NULL == ptNode->next) { + return NULL; + } + citer = ptNode->next->find(it->rune); + if (ptNode->next->end() == citer) { + return NULL; + } + ptNode = citer->second; + } + return ptNode->ptValue; + } + + void Find(RuneStrArray::const_iterator begin, + RuneStrArray::const_iterator end, + vector&res, + size_t max_word_len = MAX_WORD_LENGTH) const { + assert(root_ != NULL); + res.resize(end - begin); + + const TrieNode *ptNode = NULL; + TrieNode::NextMap::const_iterator citer; + for (size_t i = 0; i < size_t(end - begin); i++) { + res[i].runestr = *(begin + i); + + if (root_->next != NULL && root_->next->end() != (citer = root_->next->find(res[i].runestr.rune))) { + ptNode = citer->second; + } else { + ptNode = NULL; + } + if (ptNode != NULL) { + res[i].nexts.push_back(pair(i, ptNode->ptValue)); + } else { + res[i].nexts.push_back(pair(i, static_cast(NULL))); + } + + for (size_t j = i + 1; j < size_t(end - begin) && (j - i + 1) <= max_word_len; j++) { + if (ptNode == NULL || ptNode->next == NULL) { + break; + } + citer = 
ptNode->next->find((begin + j)->rune); + if (ptNode->next->end() == citer) { + break; + } + ptNode = citer->second; + if (NULL != ptNode->ptValue) { + res[i].nexts.push_back(pair(j, ptNode->ptValue)); + } + } + } + } + + void InsertNode(const Unicode& key, const DictUnit* ptValue) { /* adds key to the trie, creating any missing nodes, and stores ptValue on the last node (replacing an earlier value); an empty key is ignored */ + if (key.begin() == key.end()) { + return; + } + + TrieNode::NextMap::const_iterator kmIter; + TrieNode *ptNode = root_; + for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { + if (NULL == ptNode->next) { + ptNode->next = new TrieNode::NextMap; + } + kmIter = ptNode->next->find(*citer); + if (ptNode->next->end() == kmIter) { + TrieNode *nextNode = new TrieNode; + + ptNode->next->insert(make_pair(*citer, nextNode)); + ptNode = nextNode; + } else { + ptNode = kmIter->second; + } + } + assert(ptNode != NULL); + ptNode->ptValue = ptValue; + } + void DeleteNode(const Unicode& key, const DictUnit* ptValue) { + // Removes the word `key` by clearing the value stored on its terminal node. + // Nodes stay in place (they may be prefixes of, or shared with, other words) + // and are released by ~Trie(). ptValue is not consulted, exactly as before. + // The previous code erased the first rune's child from the root and then read + // the invalidated iterator, which dropped every word starting with that rune + // and leaked the rest of its subtree. + (void)ptValue; + if (key.begin() == key.end()) { + return; + } + TrieNode *ptNode = root_; + for (Unicode::const_iterator citer = key.begin(); citer != key.end(); ++citer) { + // No children below this node: the key is not in the trie. + if (NULL == ptNode->next) { + return; + } + TrieNode::NextMap::const_iterator kmIter = ptNode->next->find(*citer); + // Rune not found: the key is not in the trie. + if (ptNode->next->end() == kmIter) { + return; + } + ptNode = kmIter->second; + } + // Whole key matched: unlink the word; lookups of key now return NULL. + ptNode->ptValue = NULL; + } + private: + void CreateTrie(const vector& keys, const vector& valuePointers) { /* inserts keys[i] with valuePointers[i] for every i; does nothing when either vector is empty */ + if (valuePointers.empty() || keys.empty()) { + return; + } + assert(keys.size() == valuePointers.size()); + + for (size_t i = 0; i < keys.size(); i++) { + InsertNode(keys[i], valuePointers[i]); + } + } + + void DeleteNode(TrieNode* node) { + if (NULL == node) { + return; + } + if (NULL != node->next) { + for (TrieNode::NextMap::iterator it = node->next->begin(); it != node->next->end(); ++it) { + DeleteNode(it->second); + } + delete
node->next; + } + delete node; + } + + TrieNode* root_; +}; // class Trie +} // namespace cppjieba + +#endif // CPPJIEBA_TRIE_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp new file mode 100644 index 000000000..7f064569a --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/cppjieba/Unicode.hpp @@ -0,0 +1,227 @@ +#ifndef CPPJIEBA_UNICODE_H +#define CPPJIEBA_UNICODE_H + +#include +#include +#include +#include +#include +#include "limonp/LocalVector.hpp" + +namespace cppjieba { + +using std::string; +using std::vector; + +typedef uint32_t Rune; + +struct Word { + string word; + uint32_t offset; + uint32_t unicode_offset; + uint32_t unicode_length; + Word(const string& w, uint32_t o) + : word(w), offset(o) { + } + Word(const string& w, uint32_t o, uint32_t unicode_offset, uint32_t unicode_length) + : word(w), offset(o), unicode_offset(unicode_offset), unicode_length(unicode_length) { + } +}; // struct Word + +inline std::ostream& operator << (std::ostream& os, const Word& w) { + return os << "{\"word\": \"" << w.word << "\", \"offset\": " << w.offset << "}"; +} + +struct RuneStr { + Rune rune; + uint32_t offset; + uint32_t len; + uint32_t unicode_offset; + uint32_t unicode_length; + RuneStr(): rune(0), offset(0), len(0), unicode_offset(0), unicode_length(0) { + } + RuneStr(Rune r, uint32_t o, uint32_t l) + : rune(r), offset(o), len(l), unicode_offset(0), unicode_length(0) { + } + RuneStr(Rune r, uint32_t o, uint32_t l, uint32_t unicode_offset, uint32_t unicode_length) + : rune(r), offset(o), len(l), unicode_offset(unicode_offset), unicode_length(unicode_length) { + } +}; // struct RuneStr + +inline std::ostream& operator << (std::ostream& os, const RuneStr& r) { + return os << "{\"rune\": \"" << r.rune << "\", \"offset\": " << r.offset << ", \"len\": " << r.len << "}"; +} + +typedef limonp::LocalVector Unicode; +typedef 
limonp::LocalVector RuneStrArray; + +// [left, right] +struct WordRange { + RuneStrArray::const_iterator left; + RuneStrArray::const_iterator right; + WordRange(RuneStrArray::const_iterator l, RuneStrArray::const_iterator r) + : left(l), right(r) { + } + size_t Length() const { + return right - left + 1; + } + bool IsAllAscii() const { + for (RuneStrArray::const_iterator iter = left; iter <= right; ++iter) { + if (iter->rune >= 0x80) { + return false; + } + } + return true; + } +}; // struct WordRange + +struct RuneStrLite { + uint32_t rune; + uint32_t len; + RuneStrLite(): rune(0), len(0) { + } + RuneStrLite(uint32_t r, uint32_t l): rune(r), len(l) { + } +}; // struct RuneStrLite + +inline RuneStrLite DecodeRuneInString(const char* str, size_t len) { + RuneStrLite rp(0, 0); + if (str == NULL || len == 0) { + return rp; + } + if (!(str[0] & 0x80)) { // 0xxxxxxx + // 7bit, total 7bit + rp.rune = (uint8_t)(str[0]) & 0x7f; + rp.len = 1; + } else if ((uint8_t)str[0] <= 0xdf && 1 < len) { + // 110xxxxxx + // 5bit, total 5bit + rp.rune = (uint8_t)(str[0]) & 0x1f; + + // 6bit, total 11bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + rp.len = 2; + } else if((uint8_t)str[0] <= 0xef && 2 < len) { // 1110xxxxxx + // 4bit, total 4bit + rp.rune = (uint8_t)(str[0]) & 0x0f; + + // 6bit, total 10bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 16bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + rp.len = 3; + } else if((uint8_t)str[0] <= 0xf7 && 3 < len) { // 11110xxxx + // 3bit, total 3bit + rp.rune = (uint8_t)(str[0]) & 0x07; + + // 6bit, total 9bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[1]) & 0x3f; + + // 6bit, total 15bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[2]) & 0x3f; + + // 6bit, total 21bit + rp.rune <<= 6; + rp.rune |= (uint8_t)(str[3]) & 0x3f; + + rp.len = 4; + } else { + rp.rune = 0; + rp.len = 0; + } + return rp; +} + +inline bool DecodeRunesInString(const char* s, size_t len, RuneStrArray& runes) { + 
runes.clear(); + runes.reserve(len / 2); + for (uint32_t i = 0, j = 0; i < len;) { + RuneStrLite rp = DecodeRuneInString(s + i, len - i); + if (rp.len == 0) { + runes.clear(); + return false; + } + RuneStr x(rp.rune, i, rp.len, j, 1); + runes.push_back(x); + i += rp.len; + ++j; + } + return true; +} + +inline bool DecodeRunesInString(const string& s, RuneStrArray& runes) { + return DecodeRunesInString(s.c_str(), s.size(), runes); +} + +inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { + unicode.clear(); + RuneStrArray runes; + if (!DecodeRunesInString(s, len, runes)) { + return false; + } + unicode.reserve(runes.size()); + for (size_t i = 0; i < runes.size(); i++) { + unicode.push_back(runes[i].rune); + } + return true; +} + +inline bool IsSingleWord(const string& str) { + RuneStrLite rp = DecodeRuneInString(str.c_str(), str.size()); + return rp.len == str.size(); +} + +inline bool DecodeRunesInString(const string& s, Unicode& unicode) { + return DecodeRunesInString(s.c_str(), s.size(), unicode); +} + +inline Unicode DecodeRunesInString(const string& s) { + Unicode result; + DecodeRunesInString(s, result); + return result; +} + + +// [left, right] +inline Word GetWordFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + uint32_t unicode_length = right->unicode_offset - left->unicode_offset + right->unicode_length; + return Word(s.substr(left->offset, len), left->offset, left->unicode_offset, unicode_length); +} + +inline string GetStringFromRunes(const string& s, RuneStrArray::const_iterator left, RuneStrArray::const_iterator right) { + assert(right->offset >= left->offset); + uint32_t len = right->offset - left->offset + right->len; + return s.substr(left->offset, len); +} + +inline void GetWordsFromWordRanges(const string& s, const vector& wrs, vector& words) { + for (size_t i = 0; i < 
wrs.size(); i++) { + words.push_back(GetWordFromRunes(s, wrs[i].left, wrs[i].right)); + } +} + +inline vector GetWordsFromWordRanges(const string& s, const vector& wrs) { + vector result; + GetWordsFromWordRanges(s, wrs, result); + return result; +} + +inline void GetStringsFromWords(const vector& words, vector& strs) { + strs.resize(words.size()); + for (size_t i = 0; i < words.size(); ++i) { + strs[i] = words[i].word; + } +} + +} // namespace cppjieba + +#endif // CPPJIEBA_UNICODE_H diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp new file mode 100644 index 000000000..ba3abe069 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ArgvContext.hpp @@ -0,0 +1,70 @@ +/************************************ + * file enc : ascii + * author : wuyanyi09@gmail.com + ************************************/ + +#ifndef LIMONP_ARGV_FUNCTS_H +#define LIMONP_ARGV_FUNCTS_H + +#include +#include +#include "StringUtil.hpp" + +namespace limonp { + +using namespace std; + +class ArgvContext { + public : + ArgvContext(int argc, const char* const * argv) { + for(int i = 0; i < argc; i++) { + if(StartsWith(argv[i], "-")) { + if(i + 1 < argc && !StartsWith(argv[i + 1], "-")) { + mpss_[argv[i]] = argv[i+1]; + i++; + } else { + sset_.insert(argv[i]); + } + } else { + args_.push_back(argv[i]); + } + } + } + ~ArgvContext() { + } + + friend ostream& operator << (ostream& os, const ArgvContext& args); + string operator [](size_t i) const { + if(i < args_.size()) { + return args_[i]; + } + return ""; + } + string operator [](const string& key) const { + map::const_iterator it = mpss_.find(key); + if(it != mpss_.end()) { + return it->second; + } + return ""; + } + + bool HasKey(const string& key) const { + if(mpss_.find(key) != mpss_.end() || sset_.find(key) != sset_.end()) { + return true; + } + return false; + } + + private: + vector args_; + map mpss_; 
+ set sset_; +}; // class ArgvContext + +inline ostream& operator << (ostream& os, const ArgvContext& args) { + return os< +#include "Condition.hpp" + +namespace limonp { +template +class BlockingQueue: NonCopyable { + public: + BlockingQueue() + : mutex_(), notEmpty_(mutex_), queue_() { + } + + void Push(const T& x) { + MutexLockGuard lock(mutex_); + queue_.push(x); + notEmpty_.Notify(); // Wait morphing saves us + } + + T Pop() { + MutexLockGuard lock(mutex_); + // always use a while-loop, due to spurious wakeup + while (queue_.empty()) { + notEmpty_.Wait(); + } + assert(!queue_.empty()); + T front(queue_.front()); + queue_.pop(); + return front; + } + + size_t Size() const { + MutexLockGuard lock(mutex_); + return queue_.size(); + } + bool Empty() const { + return Size() == 0; + } + + private: + mutable MutexLock mutex_; + Condition notEmpty_; + std::queue queue_; +}; // class BlockingQueue + +} // namespace limonp + +#endif // LIMONP_BLOCKINGQUEUE_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp new file mode 100644 index 000000000..598d0996a --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedBlockingQueue.hpp @@ -0,0 +1,67 @@ +#ifndef LIMONP_BOUNDED_BLOCKING_QUEUE_HPP +#define LIMONP_BOUNDED_BLOCKING_QUEUE_HPP + +#include "BoundedQueue.hpp" + +namespace limonp { + +template +class BoundedBlockingQueue : NonCopyable { + public: + explicit BoundedBlockingQueue(size_t maxSize) + : mutex_(), + notEmpty_(mutex_), + notFull_(mutex_), + queue_(maxSize) { + } + + void Push(const T& x) { + MutexLockGuard lock(mutex_); + while (queue_.Full()) { + notFull_.Wait(); + } + assert(!queue_.Full()); + queue_.Push(x); + notEmpty_.Notify(); + } + + T Pop() { + MutexLockGuard lock(mutex_); + while (queue_.Empty()) { + notEmpty_.Wait(); + } + assert(!queue_.Empty()); + T res = queue_.Pop(); + 
notFull_.Notify(); + return res; + } + + bool Empty() const { + MutexLockGuard lock(mutex_); + return queue_.Empty(); + } + + bool Full() const { + MutexLockGuard lock(mutex_); + return queue_.Full(); + } + + size_t size() const { + MutexLockGuard lock(mutex_); + return queue_.size(); + } + + size_t capacity() const { + return queue_.capacity(); + } + + private: + mutable MutexLock mutex_; + Condition notEmpty_; + Condition notFull_; + BoundedQueue queue_; +}; // class BoundedBlockingQueue + +} // namespace limonp + +#endif // LIMONP_BOUNDED_BLOCKING_QUEUE_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp new file mode 100644 index 000000000..f52a1079d --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/BoundedQueue.hpp @@ -0,0 +1,65 @@ +#ifndef LIMONP_BOUNDED_QUEUE_HPP +#define LIMONP_BOUNDED_QUEUE_HPP + +#include +#include +#include + +namespace limonp { +using namespace std; +template +class BoundedQueue { + public: + explicit BoundedQueue(size_t capacity): capacity_(capacity), circular_buffer_(capacity) { + head_ = 0; + tail_ = 0; + size_ = 0; + assert(capacity_); + } + ~BoundedQueue() { + } + + void Clear() { + head_ = 0; + tail_ = 0; + size_ = 0; + } + bool Empty() const { + return !size_; + } + bool Full() const { + return capacity_ == size_; + } + size_t Size() const { + return size_; + } + size_t Capacity() const { + return capacity_; + } + + void Push(const T& t) { + assert(!Full()); + circular_buffer_[tail_] = t; + tail_ = (tail_ + 1) % capacity_; + size_ ++; + } + + T Pop() { + assert(!Empty()); + size_t oldPos = head_; + head_ = (head_ + 1) % capacity_; + size_ --; + return circular_buffer_[oldPos]; + } + + private: + size_t head_; + size_t tail_; + size_t size_; + const size_t capacity_; + vector circular_buffer_; + +}; // class BoundedQueue +} // namespace limonp + +#endif diff --git 
a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp new file mode 100644 index 000000000..c9d9dd49c --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Closure.hpp @@ -0,0 +1,206 @@ +#ifndef LIMONP_CLOSURE_HPP +#define LIMONP_CLOSURE_HPP + +namespace limonp { + +class ClosureInterface { + public: + virtual ~ClosureInterface() { + } + virtual void Run() = 0; +}; + +template +class Closure0: public ClosureInterface { + public: + Closure0(Funct fun) { + fun_ = fun; + } + virtual ~Closure0() { + } + virtual void Run() { + (*fun_)(); + } + private: + Funct fun_; +}; + +template +class Closure1: public ClosureInterface { + public: + Closure1(Funct fun, Arg1 arg1) { + fun_ = fun; + arg1_ = arg1; + } + virtual ~Closure1() { + } + virtual void Run() { + (*fun_)(arg1_); + } + private: + Funct fun_; + Arg1 arg1_; +}; + +template +class Closure2: public ClosureInterface { + public: + Closure2(Funct fun, Arg1 arg1, Arg2 arg2) { + fun_ = fun; + arg1_ = arg1; + arg2_ = arg2; + } + virtual ~Closure2() { + } + virtual void Run() { + (*fun_)(arg1_, arg2_); + } + private: + Funct fun_; + Arg1 arg1_; + Arg2 arg2_; +}; + +template +class Closure3: public ClosureInterface { + public: + Closure3(Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { + fun_ = fun; + arg1_ = arg1; + arg2_ = arg2; + arg3_ = arg3; + } + virtual ~Closure3() { + } + virtual void Run() { + (*fun_)(arg1_, arg2_, arg3_); + } + private: + Funct fun_; + Arg1 arg1_; + Arg2 arg2_; + Arg3 arg3_; +}; + +template +class ObjClosure0: public ClosureInterface { + public: + ObjClosure0(Obj* p, Funct fun) { + p_ = p; + fun_ = fun; + } + virtual ~ObjClosure0() { + } + virtual void Run() { + (p_->*fun_)(); + } + private: + Obj* p_; + Funct fun_; +}; + +template +class ObjClosure1: public ClosureInterface { + public: + ObjClosure1(Obj* p, Funct fun, Arg1 arg1) { + p_ = p; + fun_ = fun; + arg1_ = arg1; + } + virtual 
~ObjClosure1() { + } + virtual void Run() { + (p_->*fun_)(arg1_); + } + private: + Obj* p_; + Funct fun_; + Arg1 arg1_; +}; + +template +class ObjClosure2: public ClosureInterface { + public: + ObjClosure2(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2) { + p_ = p; + fun_ = fun; + arg1_ = arg1; + arg2_ = arg2; + } + virtual ~ObjClosure2() { + } + virtual void Run() { + (p_->*fun_)(arg1_, arg2_); + } + private: + Obj* p_; + Funct fun_; + Arg1 arg1_; + Arg2 arg2_; +}; +template +class ObjClosure3: public ClosureInterface { + public: + ObjClosure3(Obj* p, Funct fun, Arg1 arg1, Arg2 arg2, Arg3 arg3) { + p_ = p; + fun_ = fun; + arg1_ = arg1; + arg2_ = arg2; + arg3_ = arg3; + } + virtual ~ObjClosure3() { + } + virtual void Run() { + (p_->*fun_)(arg1_, arg2_, arg3_); + } + private: + Obj* p_; + Funct fun_; + Arg1 arg1_; + Arg2 arg2_; + Arg3 arg3_; +}; + +template +ClosureInterface* NewClosure(R (*fun)()) { + return new Closure0(fun); +} + +template +ClosureInterface* NewClosure(R (*fun)(Arg1), Arg1 arg1) { + return new Closure1(fun, arg1); +} + +template +ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { + return new Closure2(fun, arg1, arg2); +} + +template +ClosureInterface* NewClosure(R (*fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { + return new Closure3(fun, arg1, arg2, arg3); +} + +template +ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)()) { + return new ObjClosure0(obj, fun); +} + +template +ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1), Arg1 arg1) { + return new ObjClosure1(obj, fun, arg1); +} + +template +ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2), Arg1 arg1, Arg2 arg2) { + return new ObjClosure2(obj, fun, arg1, arg2); +} + +template +ClosureInterface* NewClosure(Obj* obj, R (Obj::* fun)(Arg1, Arg2, Arg3), Arg1 arg1, Arg2 arg2, Arg3 arg3) { + return new ObjClosure3(obj, fun, arg1, arg2, arg3); +} + +} // namespace limonp + +#endif // LIMONP_CLOSURE_HPP diff --git 
a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp new file mode 100644 index 000000000..04edd7eb8 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Colors.hpp @@ -0,0 +1,31 @@ +#ifndef LIMONP_COLOR_PRINT_HPP +#define LIMONP_COLOR_PRINT_HPP + +#include +#include + +namespace limonp { + +using std::string; + +enum Color { + BLACK = 30, + RED, + GREEN, + YELLOW, + BLUE, + PURPLE +}; // enum Color + +static void ColorPrintln(enum Color color, const char * fmt, ...) { + va_list ap; + printf("\033[0;%dm", color); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\033[0m\n"); // if not \n , in some situation , the next lines will be set the same color unexpectedly +} + +} // namespace limonp + +#endif // LIMONP_COLOR_PRINT_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp new file mode 100644 index 000000000..656a61d7a --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Condition.hpp @@ -0,0 +1,38 @@ +#ifndef LIMONP_CONDITION_HPP +#define LIMONP_CONDITION_HPP + +#include "MutexLock.hpp" + +namespace limonp { + +class Condition : NonCopyable { + public: + explicit Condition(MutexLock& mutex) + : mutex_(mutex) { + XCHECK(!pthread_cond_init(&pcond_, NULL)); + } + + ~Condition() { + XCHECK(!pthread_cond_destroy(&pcond_)); + } + + void Wait() { + XCHECK(!pthread_cond_wait(&pcond_, mutex_.GetPthreadMutex())); + } + + void Notify() { + XCHECK(!pthread_cond_signal(&pcond_)); + } + + void NotifyAll() { + XCHECK(!pthread_cond_broadcast(&pcond_)); + } + + private: + MutexLock& mutex_; + pthread_cond_t pcond_; +}; // class Condition + +} // namespace limonp + +#endif // LIMONP_CONDITION_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp 
b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp new file mode 100644 index 000000000..c98f22277 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Config.hpp @@ -0,0 +1,103 @@ +/************************************ + * file enc : utf8 + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef LIMONP_CONFIG_H +#define LIMONP_CONFIG_H + +#include +#include +#include +#include +#include "StringUtil.hpp" + +namespace limonp { + +using namespace std; + +class Config { + public: + explicit Config(const string& filePath) { + LoadFile(filePath); + } + + operator bool () { + return !map_.empty(); + } + + string Get(const string& key, const string& defaultvalue) const { + map::const_iterator it = map_.find(key); + if(map_.end() != it) { + return it->second; + } + return defaultvalue; + } + int Get(const string& key, int defaultvalue) const { + string str = Get(key, ""); + if("" == str) { + return defaultvalue; + } + return atoi(str.c_str()); + } + const char* operator [] (const char* key) const { + if(NULL == key) { + return NULL; + } + map::const_iterator it = map_.find(key); + if(map_.end() != it) { + return it->second.c_str(); + } + return NULL; + } + + string GetConfigInfo() const { + string res; + res << *this; + return res; + } + + private: + void LoadFile(const string& filePath) { + ifstream ifs(filePath.c_str()); + assert(ifs); + string line; + vector vecBuf; + size_t lineno = 0; + while(getline(ifs, line)) { + lineno ++; + Trim(line); + if(line.empty() || StartsWith(line, "#")) { + continue; + } + vecBuf.clear(); + Split(line, vecBuf, "="); + if(2 != vecBuf.size()) { + fprintf(stderr, "line[%s] illegal.\n", line.c_str()); + assert(false); + continue; + } + string& key = vecBuf[0]; + string& value = vecBuf[1]; + Trim(key); + Trim(value); + if(!map_.insert(make_pair(key, value)).second) { + fprintf(stderr, "key[%s] already exits.\n", key.c_str()); + assert(false); + continue; + } + } + 
ifs.close(); + } + + friend ostream& operator << (ostream& os, const Config& config); + + map map_; +}; // class Config + +inline ostream& operator << (ostream& os, const Config& config) { + return os << config.map_; +} + +} // namespace limonp + +#endif // LIMONP_CONFIG_H diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp new file mode 100644 index 000000000..56a478aa5 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/FileLock.hpp @@ -0,0 +1,74 @@ +#ifndef LIMONP_FILELOCK_HPP +#define LIMONP_FILELOCK_HPP + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace limonp { + +using std::string; + +class FileLock { + public: + FileLock() : fd_(-1), ok_(true) { + } + ~FileLock() { + if(fd_ > 0) { + Close(); + } + } + void Open(const string& fname) { + assert(fd_ == -1); + fd_ = open(fname.c_str(), O_RDWR | O_CREAT, 0644); + if(fd_ < 0) { + ok_ = false; + err_ = strerror(errno); + } + } + void Close() { + ::close(fd_); + } + void Lock() { + if(LockOrUnlock(fd_, true) < 0) { + ok_ = false; + err_ = strerror(errno); + } + } + void UnLock() { + if(LockOrUnlock(fd_, false) < 0) { + ok_ = false; + err_ = strerror(errno); + } + } + bool Ok() const { + return ok_; + } + string Error() const { + return err_; + } + private: + static int LockOrUnlock(int fd, bool lock) { + errno = 0; + struct flock f; + memset(&f, 0, sizeof(f)); + f.l_type = (lock ? 
F_WRLCK : F_UNLCK); + f.l_whence = SEEK_SET; + f.l_start = 0; + f.l_len = 0; // Lock/unlock entire file + return fcntl(fd, F_SETLK, &f); + } + + int fd_; + bool ok_; + string err_; +}; // class FileLock + +}// namespace limonp + +#endif // LIMONP_FILELOCK_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp new file mode 100644 index 000000000..20766820a --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ForcePublic.hpp @@ -0,0 +1,7 @@ +#ifndef LIMONP_FORCE_PUBLIC_H +#define LIMONP_FORCE_PUBLIC_H + +#define private public +#define protected public + +#endif // LIMONP_FORCE_PUBLIC_H diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp new file mode 100644 index 000000000..11339cc8e --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/LocalVector.hpp @@ -0,0 +1,139 @@ +#ifndef LIMONP_LOCAL_VECTOR_HPP +#define LIMONP_LOCAL_VECTOR_HPP + +#include +#include +#include +#include + +namespace limonp { +using namespace std; +/* + * LocalVector : T must be primitive type (char , int, size_t), if T is struct or class, LocalVector may be dangerous.. + * LocalVector is simple and not well-tested. 
+ */ +const size_t LOCAL_VECTOR_BUFFER_SIZE = 16; +template +class LocalVector { + public: + typedef const T* const_iterator ; + typedef T value_type; + typedef size_t size_type; + private: + T buffer_[LOCAL_VECTOR_BUFFER_SIZE]; + T * ptr_; + size_t size_; + size_t capacity_; + public: + LocalVector() { + init_(); + }; + LocalVector(const LocalVector& vec) { + init_(); + *this = vec; + } + LocalVector(const_iterator begin, const_iterator end) { // TODO: make it faster + init_(); + while(begin != end) { + push_back(*begin++); + } + } + LocalVector(size_t size, const T& t) { // TODO: make it faster + init_(); + while(size--) { + push_back(t); + } + } + ~LocalVector() { + if(ptr_ != buffer_) { + free(ptr_); + } + }; + public: + LocalVector& operator = (const LocalVector& vec) { + clear(); + size_ = vec.size(); + capacity_ = vec.capacity(); + if(vec.buffer_ == vec.ptr_) { + memcpy(static_cast(buffer_), vec.buffer_, sizeof(T) * size_); + ptr_ = buffer_; + } else { + ptr_ = (T*) malloc(vec.capacity() * sizeof(T)); + assert(ptr_); + memcpy(static_cast(ptr_), vec.ptr_, vec.size() * sizeof(T)); + } + return *this; + } + private: + void init_() { + ptr_ = buffer_; + size_ = 0; + capacity_ = LOCAL_VECTOR_BUFFER_SIZE; + } + public: + T& operator [] (size_t i) { + return ptr_[i]; + } + const T& operator [] (size_t i) const { + return ptr_[i]; + } + void push_back(const T& t) { + if(size_ == capacity_) { + assert(capacity_); + reserve(capacity_ * 2); + } + ptr_[size_ ++ ] = t; + } + void reserve(size_t size) { + if(size <= capacity_) { + return; + } + T * next = (T*)malloc(sizeof(T) * size); + assert(next); + T * old = ptr_; + ptr_ = next; + memcpy(static_cast(ptr_), old, sizeof(T) * capacity_); + capacity_ = size; + if(old != buffer_) { + free(old); + } + } + bool empty() const { + return 0 == size(); + } + size_t size() const { + return size_; + } + size_t capacity() const { + return capacity_; + } + const_iterator begin() const { + return ptr_; + } + const_iterator end() 
const { + return ptr_ + size_; + } + void clear() { + if(ptr_ != buffer_) { + free(ptr_); + } + init_(); + } +}; + +template +ostream & operator << (ostream& os, const LocalVector& vec) { + if(vec.empty()) { + return os << "[]"; + } + os<<"[\""< +#include +#include +#include +#include + +#ifdef XLOG +#error "XLOG has been defined already" +#endif // XLOG +#ifdef XCHECK +#error "XCHECK has been defined already" +#endif // XCHECK + +#define XLOG(level) limonp::Logger(limonp::LL_##level, __FILE__, __LINE__).Stream() +#define XCHECK(exp) if(!(exp)) XLOG(FATAL) << "exp: ["#exp << "] false. " + +namespace limonp { + +enum { + LL_DEBUG = 0, + LL_INFO = 1, + LL_WARNING = 2, + LL_ERROR = 3, + LL_FATAL = 4, +}; // enum + +static const char * LOG_LEVEL_ARRAY[] = {"DEBUG","INFO","WARN","ERROR","FATAL"}; +static const char * LOG_TIME_FORMAT = "%Y-%m-%d %H:%M:%S"; + +class Logger { + public: + Logger(size_t level, const char* filename, int lineno) + : level_(level) { +#ifdef LOGGING_LEVEL + if (level_ < LOGGING_LEVEL) { + return; + } +#endif + assert(level_ <= sizeof(LOG_LEVEL_ARRAY)/sizeof(*LOG_LEVEL_ARRAY)); + + char buf[32]; + + time_t timeNow; + time(&timeNow); + + struct tm tmNow; + + #if defined(_WIN32) || defined(_WIN64) + errno_t e = localtime_s(&tmNow, &timeNow); + assert(e = 0); + #else + struct tm * tm_tmp = localtime_r(&timeNow, &tmNow); + assert(tm_tmp != nullptr); + #endif + + strftime(buf, sizeof(buf), LOG_TIME_FORMAT, &tmNow); + + stream_ << buf + << " " << filename + << ":" << lineno + << " " << LOG_LEVEL_ARRAY[level_] + << " "; + } + ~Logger() { +#ifdef LOGGING_LEVEL + if (level_ < LOGGING_LEVEL) { + return; + } +#endif + std::cerr << stream_.str() << std::endl; + if (level_ == LL_FATAL) { + abort(); + } + } + + std::ostream& Stream() { + return stream_; + } + + private: + std::ostringstream stream_; + size_t level_; +}; // class Logger + +} // namespace limonp + +#endif // LIMONP_LOGGING_HPP diff --git 
a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp new file mode 100644 index 000000000..d30f3b545 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Md5.hpp @@ -0,0 +1,411 @@ +#ifndef __MD5_H__ +#define __MD5_H__ + +// Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All +// rights reserved. + +// License to copy and use this software is granted provided that it +// is identified as the "RSA Data Security, Inc. MD5 Message-Digest +// Algorithm" in all material mentioning or referencing this software +// or this function. +// +// License is also granted to make and use derivative works provided +// that such works are identified as "derived from the RSA Data +// Security, Inc. MD5 Message-Digest Algorithm" in all material +// mentioning or referencing the derived work. +// +// RSA Data Security, Inc. makes no representations concerning either +// the merchantability of this software or the suitability of this +// software for any particular purpose. It is provided "as is" +// without express or implied warranty of any kind. +// +// These notices must be retained in any copies of any part of this +// documentation and/or software. + + + +// The original md5 implementation avoids external libraries. +// This version has dependency on stdio.h for file input and +// string.h for memcpy. +#include +#include +#include + +namespace limonp { + +//#pragma region MD5 defines +// Constants for MD5Transform routine. +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + + +// F, G, H and I are basic MD5 functions. 
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +// ROTATE_LEFT rotates x left n bits. +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +// FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. +// Rotation is separate from addition to prevent recomputation. +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (UINT4)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +//#pragma endregion + + +typedef unsigned char BYTE ; + +// POINTER defines a generic pointer type +typedef unsigned char *POINTER; + +// UINT2 defines a two byte word +typedef unsigned short int UINT2; + +// UINT4 defines a four byte word +typedef unsigned int UINT4; + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; +// convenient object that wraps +// the C-functions for use in C++ only +class MD5 { + private: + struct __context_t { + UINT4 state[4]; /* state (ABCD) */ + UINT4 count[2]; /* number of bits, modulo 2^64 (lsb first) */ + unsigned char buffer[64]; /* input buffer */ + } context ; + + //#pragma region static helper functions + // The core of the MD5 algorithm is here. + // MD5 basic transformation. Transforms state based on block. 
+ static void MD5Transform( UINT4 state[4], unsigned char block[64] ) { + UINT4 a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, 
x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + // Zeroize sensitive information. + memset((POINTER)x, 0, sizeof (x)); + } + + // Encodes input (UINT4) into output (unsigned char). Assumes len is + // a multiple of 4. 
+  static void Encode( unsigned char *output, UINT4 *input, unsigned int len ) {
+    unsigned int i, j;
+
+    for (i = 0, j = 0; j < len; i++, j += 4) {
+      output[j] = (unsigned char)(input[i] & 0xff);
+      output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+      output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+      output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+    }
+  }
+
+  // Decodes input (unsigned char) into output (UINT4), least-significant
+  // byte first. Assumes len is a multiple of 4.
+  static void Decode( UINT4 *output, unsigned char *input, unsigned int len ) {
+    unsigned int i, j;
+
+    for (i = 0, j = 0; j < len; i++, j += 4)
+      output[i] = ((UINT4)input[j]) | (((UINT4)input[j+1]) << 8) |
+                  (((UINT4)input[j+2]) << 16) | (((UINT4)input[j+3]) << 24);
+  }
+  //#pragma endregion
+
+
+ public:
+  // MAIN FUNCTIONS
+  MD5() {
+    Init() ;
+  }
+
+  // MD5 initialization. Begins an MD5 operation, writing a new context.
+  void Init() {
+    context.count[0] = context.count[1] = 0;
+
+    // Load magic initialization constants.
+    context.state[0] = 0x67452301;
+    context.state[1] = 0xefcdab89;
+    context.state[2] = 0x98badcfe;
+    context.state[3] = 0x10325476;
+  }
+
+  // MD5 block update operation. Continues an MD5 message-digest
+  // operation, processing another message block, and updating the
+  // context.
+  void Update(
+    unsigned char *input, // input block
+    unsigned int inputLen ) { // length of input block
+    unsigned int i, index, partLen;
+
+    // Compute number of bytes mod 64
+    index = (unsigned int)((context.count[0] >> 3) & 0x3F);
+
+    // Update number of bits (count[1] receives the carry out of count[0])
+    if ((context.count[0] += ((UINT4)inputLen << 3))
+        < ((UINT4)inputLen << 3))
+      context.count[1]++;
+    context.count[1] += ((UINT4)inputLen >> 29);
+
+    partLen = 64 - index;
+
+    // Transform as many times as possible.
+    if (inputLen >= partLen) {
+      memcpy((POINTER)&context.buffer[index], (POINTER)input, partLen);
+      MD5Transform (context.state, context.buffer);
+
+      for (i = partLen; i + 63 < inputLen; i += 64)
+        MD5Transform (context.state, &input[i]);
+
+      index = 0;
+    } else
+      i = 0;
+
+    /* Buffer remaining input */
+    memcpy((POINTER)&context.buffer[index], (POINTER)&input[i], inputLen-i);
+  }
+
+  // MD5 finalization. Ends an MD5 message-digest operation, writing
+  // the message digest and zeroizing the context.
+  // Writes to digestRaw, then renders digestChars via writeToString().
+  void Final() {
+    unsigned char bits[8];
+    unsigned int index, padLen;
+
+    // Save number of bits
+    Encode( bits, context.count, 8 );
+
+    // Pad out to 56 mod 64.
+    index = (unsigned int)((context.count[0] >> 3) & 0x3f);
+    padLen = (index < 56) ? (56 - index) : (120 - index);
+    Update( PADDING, padLen );
+
+    // Append length (before padding)
+    Update( bits, 8 );
+
+    // Store state in digest
+    Encode( digestRaw, context.state, 16);
+
+    // Zeroize sensitive information.
+    memset((POINTER)&context, 0, sizeof (context));
+
+    writeToString() ;
+  }
+
+  /// Renders the 16 bytes of digestRaw as 32 lowercase hex digits + NUL into digestChars.
+  void writeToString() {
+    int pos ;
+
+    for( pos = 0 ; pos < 16 ; pos++ )  // bounded write: never past the end of digestChars
+      snprintf( digestChars+(pos*2), sizeof(digestChars)-(pos*2), "%02x", digestRaw[pos] ) ;
+  }
+
+
+ public:
+  // an MD5 digest is a 16-byte number (32 hex digits)
+  BYTE digestRaw[ 16 ] ;
+
+  // This version of the digest is actually
+  // a "printf'd" version of the digest.
+  char digestChars[ 33 ] ;
+
+  /// Load a file from disk and digest it
+  // Digests a file and returns the result.
+  const char* digestFile( const char *filename ) {
+    if (NULL == filename || strcmp(filename, "") == 0)
+      return NULL;
+
+    Init() ;
+
+    FILE *file;
+
+    unsigned char buffer[1024] ;
+
+    if((file = fopen (filename, "rb")) == NULL) {
+      return NULL;
+    }
+    size_t len;  // fread returns size_t; each chunk is at most sizeof(buffer) bytes
+    while( (len = fread( buffer, 1, sizeof(buffer), file )) != 0 )
+      Update( buffer, (unsigned int)len ) ;
+    Final();
+
+    fclose( file );
+
+    return digestChars ;
+  }
+
+  /// Digests a byte-array already in memory; returns NULL for a null buffer or negative length
+  const char* digestMemory( BYTE *memchunk, int len ) {
+    if (NULL == memchunk || len < 0)  // a negative len would wrap to a huge unsigned length in Update
+      return NULL;
+
+    Init() ;
+    Update( memchunk, len ) ;
+    Final() ;
+
+    return digestChars ;
+  }
+
+  // Digests a NUL-terminated string and returns the hex digest (NULL for a null pointer).
+  const char* digestString(const char *string ) {
+    if (string == NULL)
+      return NULL;
+
+    Init() ;
+    Update( (unsigned char*)string, strlen(string) ) ;
+    Final() ;
+
+    return digestChars ;
+  }
+};
+
+inline bool md5String(const char* str, std::string& res) {
+  if (NULL == str) {
+    res = "";
+    return false;
+  }
+
+  MD5 md5;
+  const char *pRes = md5.digestString(str);
+  if (NULL == pRes) {
+    res = "";
+    return false;
+  }
+
+  res = pRes;
+  return true;
+}
+
+inline bool md5File(const char* filepath, std::string& res) {
+  if (NULL == filepath || strcmp(filepath, "") == 0) {
+    res = "";
+    return false;
+  }
+
+  MD5 md5;
+  const char *pRes = md5.digestFile(filepath);
+
+  if (NULL == pRes) {
+    res = "";
+    return false;
+  }
+
+  res = pRes;
+  return true;
+}
+}
+#endif
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp
new file mode 100644
index 000000000..ea10d6dee
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/MutexLock.hpp
@@ -0,0 +1,51 @@
+#ifndef LIMONP_MUTEX_LOCK_HPP
+#define LIMONP_MUTEX_LOCK_HPP
+
+#include <pthread.h>
+#include "NonCopyable.hpp"
+#include "Logging.hpp"
+
+namespace limonp {
+
+class MutexLock: NonCopyable {
+ public:
+  MutexLock() {
+    XCHECK(!pthread_mutex_init(&mutex_, NULL));
+  }
+  ~MutexLock() {
+    XCHECK(!pthread_mutex_destroy(&mutex_));
+  }
+  pthread_mutex_t* GetPthreadMutex() {
+    return &mutex_;
+  }
+
+ private:
+  void Lock() {
+    XCHECK(!pthread_mutex_lock(&mutex_));
+  }
+  void Unlock() {
+    XCHECK(!pthread_mutex_unlock(&mutex_));
+  }
+  friend class MutexLockGuard;
+
+  pthread_mutex_t mutex_;
+}; // class MutexLock
+
+class MutexLockGuard: NonCopyable {
+ public:
+  explicit MutexLockGuard(MutexLock & mutex)
+    : mutex_(mutex) {
+    mutex_.Lock();
+  }
+  ~MutexLockGuard() {
+    mutex_.Unlock();
+  }
+ private:
+  MutexLock & mutex_;
+}; // class MutexLockGuard
+
+#define MutexLockGuard(x) XCHECK(false);
+
+} // namespace limonp
+
+#endif // LIMONP_MUTEX_LOCK_HPP
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp
new file mode 100644
index 000000000..145400f4b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/NonCopyable.hpp
@@ -0,0 +1,21 @@
+/************************************
+ ************************************/
+#ifndef LIMONP_NONCOPYABLE_H
+#define LIMONP_NONCOPYABLE_H
+
+namespace limonp {
+
+class NonCopyable {
+ protected:
+  NonCopyable() {
+  }
+  ~NonCopyable() {
+  }
+ private:
+  NonCopyable(const NonCopyable& );
+  const NonCopyable& operator=(const NonCopyable& );
+}; // class NonCopyable
+
+} // namespace limonp
+
+#endif // LIMONP_NONCOPYABLE_H
diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp
new file mode 100644
index 000000000..cf00e9416
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StdExtension.hpp
@@ -0,0 +1,157 @@
+#ifndef LIMONP_STD_EXTEMSION_HPP
+#define LIMONP_STD_EXTEMSION_HPP
+
+#include <map>
+
+#ifdef __APPLE__
+#include <unordered_map>
+#include <unordered_set>
+#elif(__cplusplus >= 201103L)
+#include <unordered_map>
+#include <unordered_set>
+#elif defined _MSC_VER +#include +#include +#else +#include +#include +namespace std { +using std::tr1::unordered_map; +using std::tr1::unordered_set; +} + +#endif + +#include +#include +#include +#include +#include +#include + +namespace std { + +template +ostream& operator << (ostream& os, const vector& v) { + if(v.empty()) { + return os << "[]"; + } + os<<"["< +inline ostream& operator << (ostream& os, const vector& v) { + if(v.empty()) { + return os << "[]"; + } + os<<"[\""< +ostream& operator << (ostream& os, const deque& dq) { + if(dq.empty()) { + return os << "[]"; + } + os<<"[\""< +ostream& operator << (ostream& os, const pair& pr) { + os << pr.first << ":" << pr.second ; + return os; +} + + +template +string& operator << (string& str, const T& obj) { + stringstream ss; + ss << obj; // call ostream& operator << (ostream& os, + return str = ss.str(); +} + +template +ostream& operator << (ostream& os, const map& mp) { + if(mp.empty()) { + os<<"{}"; + return os; + } + os<<'{'; + typename map::const_iterator it = mp.begin(); + os<<*it; + it++; + while(it != mp.end()) { + os<<", "<<*it; + it++; + } + os<<'}'; + return os; +} +template +ostream& operator << (ostream& os, const std::unordered_map& mp) { + if(mp.empty()) { + return os << "{}"; + } + os<<'{'; + typename std::unordered_map::const_iterator it = mp.begin(); + os<<*it; + it++; + while(it != mp.end()) { + os<<", "<<*it++; + } + return os<<'}'; +} + +template +ostream& operator << (ostream& os, const set& st) { + if(st.empty()) { + os << "{}"; + return os; + } + os<<'{'; + typename set::const_iterator it = st.begin(); + os<<*it; + it++; + while(it != st.end()) { + os<<", "<<*it; + it++; + } + os<<'}'; + return os; +} + +template +bool IsIn(const ContainType& contain, const KeyType& key) { + return contain.end() != contain.find(key); +} + +template +basic_string & operator << (basic_string & s, ifstream & ifs) { + return s.assign((istreambuf_iterator(ifs)), istreambuf_iterator()); +} + +template +ofstream 
& operator << (ofstream & ofs, const basic_string& s) { + ostreambuf_iterator itr (ofs); + copy(s.begin(), s.end(), itr); + return ofs; +} + +} // namespace std + +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp new file mode 100644 index 000000000..ad3be56f3 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/StringUtil.hpp @@ -0,0 +1,405 @@ +/************************************ + * file enc : ascii + * author : wuyanyi09@gmail.com + ************************************/ +#ifndef LIMONP_STR_FUNCTS_H +#define LIMONP_STR_FUNCTS_H +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "StdExtension.hpp" + +namespace limonp { +using namespace std; +inline string StringFormat(const char* fmt, ...) { + int size = 256; + std::string str; + va_list ap; + while (1) { + str.resize(size); + va_start(ap, fmt); + int n = vsnprintf((char *)str.c_str(), size, fmt, ap); + va_end(ap); + if (n > -1 && n < size) { + str.resize(n); + return str; + } + if (n > -1) + size = n + 1; + else + size *= 2; + } + return str; +} + +template +void Join(T begin, T end, string& res, const string& connector) { + if(begin == end) { + return; + } + stringstream ss; + ss<<*begin; + begin++; + while(begin != end) { + ss << connector << *begin; + begin ++; + } + res = ss.str(); +} + +template +string Join(T begin, T end, const string& connector) { + string res; + Join(begin ,end, res, connector); + return res; +} + +inline string& Upper(string& str) { + transform(str.begin(), str.end(), str.begin(), (int (*)(int))toupper); + return str; +} + +inline string& Lower(string& str) { + transform(str.begin(), str.end(), str.begin(), (int (*)(int))tolower); + return str; +} + +inline bool IsSpace(unsigned c) { + // 
when passing large int as the argument of isspace, it core dump, so here need a type cast.
+  return c > 0xff ? false : std::isspace(c & 0xff) != 0;
+}
+
+inline std::string& LTrim(std::string &s) {
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
+    return !std::isspace(ch);
+  }));
+#else
+  // Pre-C++11 fallback; std::not1/std::ptr_fun no longer exist in C++17.
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::ptr_fun(IsSpace))));
+#endif
+  return s;
+}
+
+inline std::string& RTrim(std::string &s) {
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+  // Lambda path: any C++11 compiler, or MSVC 2017 and newer
+  s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
+    return !std::isspace(ch);
+  }).base(), s.end());
+#else
+  // Pre-C++11 fallback; std::not1/std::ptr_fun no longer exist in C++17.
+  s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::ptr_fun(IsSpace))).base(), s.end());
+#endif
+  return s;
+}
+
+inline std::string& Trim(std::string &s) {
+  return LTrim(RTrim(s));
+}
+
+inline std::string& LTrim(std::string& s, char x) {
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(),
+    [x](char c) { return c != x; }));  // strips only x, same as the fallback branch
+#else
+  s.erase(s.begin(), std::find_if(s.begin(), s.end(), std::not1(std::bind2nd(std::equal_to<char>(), x))));
+#endif
+  return s;
+}
+
+inline std::string& RTrim(std::string& s, char x) {
+#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1910)
+  s.erase(std::find_if(s.rbegin(), s.rend(),
+    [x](char c) { return c != x; }).base(), s.end());  // strips only x, same as the fallback branch
+#else
+  s.erase(std::find_if(s.rbegin(), s.rend(), std::not1(std::bind2nd(std::equal_to<char>(), x))).base(), s.end());
+#endif
+  return s;
+}
+
+inline std::string& Trim(std::string &s, char x) {
+  return LTrim(RTrim(s, x), x);
+}
+
+inline void Split(const string& src, vector<string>& res, const string& pattern, size_t maxsplit = string::npos) {
+  res.clear();
+  size_t Start = 0;
+  size_t end = 0;
+  string sub;
+  while(Start < src.size()) {
+    end = src.find_first_of(pattern, Start);
+    if(string::npos == end || res.size() >= maxsplit) {
+      sub = src.substr(Start);
+      res.push_back(sub);
+      return;
+    }
+    sub = src.substr(Start, end - Start);
+    res.push_back(sub);
+    Start = end + 1;
+  }
+  return;
+}
+
+inline vector<string> Split(const string& src, const string& pattern, size_t maxsplit = string::npos) {
+  vector<string> res;
+  Split(src, res, pattern, maxsplit);
+  return res;
+}
+
+inline bool StartsWith(const string& str, const string& prefix) {
+  if(prefix.length() > str.length()) {
+    return false;
+  }
+  return 0 == str.compare(0, prefix.length(), prefix);
+}
+
+inline bool EndsWith(const string& str, const string& suffix) {
+  if(suffix.length() > str.length()) {
+    return false;
+  }
+  return 0 == str.compare(str.length() - suffix.length(), suffix.length(), suffix);
+}
+
+inline bool IsInStr(const string& str, char ch) {
+  return str.find(ch) != string::npos;
+}
+
+inline uint16_t TwocharToUint16(char high, char low) {
+  return (((uint16_t(high) & 0x00ff ) << 8) | (uint16_t(low) & 0x00ff));
+}
+
+template <class Uint16Container>
+bool Utf8ToUnicode(const char * const str, size_t len, Uint16Container& vec) {
+  if(!str) {
+    return false;
+  }
+  char ch1, ch2;
+  uint16_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < len;) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      vec.push_back(str[i]);
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < len) { // 110xxxxxx
+      ch1 = (str[i] >> 2) & 0x07;
+      ch2 = (str[i+1] & 0x3f) | ((str[i] & 0x03) << 6 );
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < len) {
+      ch1 = ((uint8_t)str[i] << 4) | ((str[i+1] >> 2) & 0x0f );
+      ch2 = (((uint8_t)str[i+1]<<6) & 0xc0) | (str[i+2] & 0x3f);
+      tmp = (((uint16_t(ch1) & 0x00ff ) << 8) | (uint16_t(ch2) & 0x00ff));
+      vec.push_back(tmp);
+      i += 3;
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <class Uint16Container>
+bool Utf8ToUnicode(const string& str, Uint16Container& vec) {
+  return Utf8ToUnicode(str.c_str(), str.size(), vec);
+}
+
+template <class Uint32Container>
+bool Utf8ToUnicode32(const string& str, Uint32Container& vec) {
+  uint32_t tmp;
+  vec.clear();
+  for(size_t i = 0; i < str.size();) {
+    if(!(str[i] & 0x80)) { // 0xxxxxxx
+      // 7bit, total 7bit
+      tmp = (uint8_t)(str[i]) & 0x7f;
+      i++;
+    } else if ((uint8_t)str[i] <= 0xdf && i + 1 < str.size()) { // 110xxxxxx
+      // 5bit, total 5bit
+      tmp = (uint8_t)(str[i]) & 0x1f;
+
+      // 6bit, total 11bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+      i += 2;
+    } else if((uint8_t)str[i] <= 0xef && i + 2 < str.size()) { // 1110xxxxxx
+      // 4bit, total 4bit
+      tmp = (uint8_t)(str[i]) & 0x0f;
+
+      // 6bit, total 10bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 16bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      i += 3;
+    } else if((uint8_t)str[i] <= 0xf7 && i + 3 < str.size()) { // 11110xxxx
+      // 3bit, total 3bit
+      tmp = (uint8_t)(str[i]) & 0x07;
+
+      // 6bit, total 9bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+1]) & 0x3f;
+
+      // 6bit, total 15bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+2]) & 0x3f;
+
+      // 6bit, total 21bit
+      tmp <<= 6;
+      tmp |= (uint8_t)(str[i+3]) & 0x3f;
+
+      i += 4;
+    } else {
+      return false;
+    }
+    vec.push_back(tmp);
+  }
+  return true;
+}
+
+template <class Uint32ContainerConIter>
+void Unicode32ToUtf8(Uint32ContainerConIter begin, Uint32ContainerConIter end, string& res) {
+  res.clear();
+  uint32_t ui;
+  while(begin != end) {
+    ui = *begin;
+    if(ui <= 0x7f) {
+      res += char(ui);
+    } else if(ui <= 0x7ff) {
+      res += char(((ui >> 6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else if(ui <= 0xffff) {
+      res += char(((ui >> 12) & 0x0f) | 0xe0);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    } else {
+      res += char(((ui >> 18) & 0x07) | 0xf0);  // 3 payload bits (11110xxx), mirroring the 0x07 mask in Utf8ToUnicode32
+      res += char(((ui >> 12) & 0x3f) | 0x80);
+      res += char(((ui >> 6) & 0x3f) | 0x80);
+      res += char((ui & 0x3f) | 0x80);
+    }
+    begin ++;
+  }
+}
+
+template <class Uint16ContainerConIter>
+void UnicodeToUtf8(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+  res.clear();
+  uint16_t ui;
+  while(begin != end) {
+    ui = *begin;
+    if(ui <= 0x7f) {
+      res += char(ui);
+    } else if(ui <= 0x7ff) {
+      res += char(((ui>>6) & 0x1f) | 0xc0);
+      res += char((ui & 0x3f) | 0x80);
+    } else {
+      res += char(((ui >> 12) & 0x0f )| 0xe0);
+      res += char(((ui>>6) & 0x3f )| 0x80 );
+      res += char((ui & 0x3f) | 0x80);
+    }
+    begin ++;
+  }
+}
+
+
+template <class Uint16Container>
+bool GBKTrans(const char* const str, size_t len, Uint16Container& vec) {
+  vec.clear();
+  if(!str) {
+    return true;
+  }
+  size_t i = 0;
+  while(i < len) {
+    if(0 == (str[i] & 0x80)) {
+      vec.push_back(uint16_t(str[i]));
+      i++;
+    } else {
+      if(i + 1 < len) { //&& (str[i+1] & 0x80))
+        uint16_t tmp = (((uint16_t(str[i]) & 0x00ff ) << 8) | (uint16_t(str[i+1]) & 0x00ff));
+        vec.push_back(tmp);
+        i += 2;
+      } else {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+template <class Uint16Container>
+bool GBKTrans(const string& str, Uint16Container& vec) {
+  return GBKTrans(str.c_str(), str.size(), vec);
+}
+
+template <class Uint16ContainerConIter>
+void GBKTrans(Uint16ContainerConIter begin, Uint16ContainerConIter end, string& res) {
+  res.clear();
+  //pair pa;
+  char first, second;
+  while(begin != end) {
+    //pa = uint16ToChar2(*begin);
+    first = ((*begin)>>8) & 0x00ff;
+    second = (*begin) & 0x00ff;
+    if(first & 0x80) {
+      res += first;
+      res += second;
+    } else {
+      res += second;
+    }
+    begin++;
+  }
+}
+
+/*
+ * format example: "%Y-%m-%d %H:%M:%S"
+ */
+inline void GetTime(const string& format, string& timeStr) {
+  time_t timeNow;
+  time(&timeNow);
+
+  struct tm tmNow;
+
+  #if defined(_WIN32) || defined(_WIN64)
+  errno_t e = localtime_s(&tmNow, &timeNow);
+  assert(e == 0); (void)e;  // compare, do not assign; (void) keeps NDEBUG builds warning-free
+  #else
+  struct tm * tm_tmp = localtime_r(&timeNow, &tmNow);
+  assert(tm_tmp != nullptr); (void)tm_tmp;
+  #endif
+
+  timeStr.resize(64);
+
+  size_t len = strftime(&timeStr[0], timeStr.size(), format.c_str(), &tmNow);  // write via mutable storage, not c_str()
+
+  timeStr.resize(len);
+}
+
+inline string PathJoin(const string& path1, const string& path2) {
+  if(EndsWith(path1, "/")) {
+    return path1 + path2;
+  }
+  return
path1 + "/" + path2; +} + +} +#endif diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp new file mode 100644 index 000000000..4e3c08442 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/Thread.hpp @@ -0,0 +1,44 @@ +#ifndef LIMONP_THREAD_HPP +#define LIMONP_THREAD_HPP + +#include "Logging.hpp" +#include "NonCopyable.hpp" + +namespace limonp { + +class IThread: NonCopyable { + public: + IThread(): isStarted(false), isJoined(false) { + } + virtual ~IThread() { + if(isStarted && !isJoined) { + XCHECK(!pthread_detach(thread_)); + } + }; + + virtual void Run() = 0; + void Start() { + XCHECK(!isStarted); + XCHECK(!pthread_create(&thread_, NULL, Worker, this)); + isStarted = true; + } + void Join() { + XCHECK(!isJoined); + XCHECK(!pthread_join(thread_, NULL)); + isJoined = true; + } + private: + static void * Worker(void * data) { + IThread * ptr = (IThread* ) data; + ptr->Run(); + return NULL; + } + + pthread_t thread_; + bool isStarted; + bool isJoined; +}; // class IThread + +} // namespace limonp + +#endif // LIMONP_THREAD_HPP diff --git a/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp new file mode 100644 index 000000000..fb0ee57c4 --- /dev/null +++ b/funasr/runtime/onnxruntime/third_party/jieba/include/limonp/ThreadPool.hpp @@ -0,0 +1,86 @@ +#ifndef LIMONP_THREAD_POOL_HPP +#define LIMONP_THREAD_POOL_HPP + +#include "Thread.hpp" +#include "BlockingQueue.hpp" +#include "BoundedBlockingQueue.hpp" +#include "Closure.hpp" + +namespace limonp { + +using namespace std; + +//class ThreadPool; +class ThreadPool: NonCopyable { + public: + class Worker: public IThread { + public: + Worker(ThreadPool* pool): ptThreadPool_(pool) { + assert(ptThreadPool_); + } + virtual ~Worker() { + } + + virtual void Run() { + while (true) { + 
ClosureInterface* closure = ptThreadPool_->queue_.Pop(); + if (closure == NULL) { + break; + } + try { + closure->Run(); + } catch(std::exception& e) { + XLOG(ERROR) << e.what(); + } catch(...) { + XLOG(ERROR) << " unknown exception."; + } + delete closure; + } + } + private: + ThreadPool * ptThreadPool_; + }; // class Worker + + ThreadPool(size_t thread_num) + : threads_(thread_num), + queue_(thread_num) { + assert(thread_num); + for(size_t i = 0; i < threads_.size(); i ++) { + threads_[i] = new Worker(this); + } + } + ~ThreadPool() { + Stop(); + } + + void Start() { + for(size_t i = 0; i < threads_.size(); i++) { + threads_[i]->Start(); + } + } + void Stop() { + for(size_t i = 0; i < threads_.size(); i ++) { + queue_.Push(NULL); + } + for(size_t i = 0; i < threads_.size(); i ++) { + threads_[i]->Join(); + delete threads_[i]; + } + threads_.clear(); + } + + void Add(ClosureInterface* task) { + assert(task); + queue_.Push(task); + } + + private: + friend class Worker; + + vector threads_; + BoundedBlockingQueue queue_; +}; // class ThreadPool + +} // namespace limonp + +#endif // LIMONP_THREAD_POOL_HPP diff --git a/funasr/runtime/websocket/CMakeLists.txt b/funasr/runtime/websocket/CMakeLists.txt index 72911726b..56c425518 100644 --- a/funasr/runtime/websocket/CMakeLists.txt +++ b/funasr/runtime/websocket/CMakeLists.txt @@ -111,6 +111,8 @@ endif() include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/include/) include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp/include/) include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank) +include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/jieba/include) +include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/jieba/include/limonp/include) add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp yaml-cpp) add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank/kaldi-native-fbank/csrc csrc)