// Segments the code-point range [begin, end) and hands the result to
// segWordInfos.
//
// A fresh SegmentContext is run through three stages in order: _calcDAG,
// _calcDP and _cut. The first stage that reports failure is logged and stops
// the pipeline.
//
// Returns true only when the object is initialised and all three stages
// succeed; false otherwise.
bool cut(Unicode::const_iterator begin , Unicode::const_iterator end, vector<TrieNodeInfo>& segWordInfos)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }

    SegmentContext ctx;

    // Each branch is evaluated only if every previous stage succeeded.
    if(!_calcDAG(begin, end, ctx))
    {
        LogError("_calcDAG failed.");
    }
    else if(!_calcDP(ctx))
    {
        LogError("_calcDP failed.");
    }
    else if(!_cut(ctx, segWordInfos))
    {
        LogError("_cut failed.");
    }
    else
    {
        return true;
    }
    return false;
}
// Segments [begin, end) and appends each resulting word, encoded through
// TransCode::encode, to res.
//
// Words whose encoding fails are logged and skipped; they do not make the
// call fail. Returns false when the object is not initialised or when the
// TrieNodeInfo-producing cut overload fails, true otherwise.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const
{
    if(!_getInitFlag())
    {
        LogError("not inited.");
        return false;
    }

    vector<TrieNodeInfo> wordInfos;
    if(!cut(begin, end, wordInfos))
    {
        return false;
    }

    string encoded;
    for(vector<TrieNodeInfo>::iterator it = wordInfos.begin(); it != wordInfos.end(); ++it)
    {
        if(!TransCode::encode(it->word, encoded))
        {
            LogError("encode failed.");
            continue;
        }
        res.push_back(encoded);
    }
    return true;
}
bool Trie::find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const { if(!_getInitFlag()) { LogFatal("trie not initted!"); return false; } TrieNode* p = _root; //for(Unicode::const_iterator it = begin; it != end; it++) for(uint i = 0; i < unico.size(); i++) { if(p->hmap.find(unico[i]) == p-> hmap.end()) { break; } p = p->hmap[unico[i]]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { res.push_back(make_pair(i, &_nodeInfoVec[pos])); } else { LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); return false; } } } return !res.empty(); }
// Loads the dictionary file at filePath into this trie.
//
// Steps, each logged on failure: the trie must already be initialised, the
// file must exist (checkFileExist), _trieInsert(filePath) must succeed, and
// finally _countWeight() must succeed.
//
// Returns true only if every step succeeds.
bool Trie::loadDict(const char * const filePath)
{
    if(!_getInitFlag())
    {
        LogError("not initted.");
        return false;
    }
    if(!checkFileExist(filePath))
    {
        LogError("cann't find fiel[%s].",filePath);
        return false;
    }

    if(!_trieInsert(filePath))
    {
        LogError("_trieInsert failed.");
        return false;
    }

    if(!_countWeight())
    {
        LogError("_countWeight failed.");
        return false;
    }

    return true;
}
// Disposes the owned trie and clears the init flag.
// Does nothing when the object is not initialised. Always returns true; the
// result of _trie.dispose() is not inspected.
virtual bool dispose()
{
    if(_getInitFlag())
    {
        _trie.dispose();
        _setInitFlag(false);
    }
    return true;
}
// Disposes both underlying segmenters (_mpSeg, then _hmmSeg) and clears the
// init flag. Does nothing when the object is not initialised. Always returns
// true; the results of the nested dispose() calls are not inspected.
bool MixSegment::dispose()
{
    if(_getInitFlag())
    {
        _mpSeg.dispose();
        _hmmSeg.dispose();
        _setInitFlag(false);
    }
    return true;
}
bool Trie::insert(const TrieNodeInfo& nodeInfo) { if(!_getInitFlag()) { LogFatal("not initted!"); return false; } const Unicode& uintVec = nodeInfo.word; TrieNode* p = _root; for(uint i = 0; i < uintVec.size(); i++) { uint16_t cu = uintVec[i]; if(NULL == p) { return false; } if(p->hmap.end() == p->hmap.find(cu)) { TrieNode * next = NULL; try { next = new TrieNode; } catch(const bad_alloc& e) { return false; } p->hmap[cu] = next; p = next; } else { p = p->hmap[cu]; } } if(NULL == p) { return false; } if(p->isLeaf) { LogError("this node already inserted"); return false; } p->isLeaf = true; _nodeInfoVec.push_back(nodeInfo); p->nodeInfoVecPos = _nodeInfoVec.size() - 1; return true; }
// Returns the info of the longest dictionary word that is a prefix of str,
// or NULL if there is none.
//
// str is decoded with TransCode::decode and the trie is walked one code point
// at a time. Every node visited -- including the node reached after the last
// code point has been consumed -- is checked for being a leaf, so a word that
// equals the whole of str is reported too. (The previous implementation only
// checked a node before descending from it and therefore never examined the
// final node.)
//
// Returns NULL when the trie is not initialised, decoding fails, a leaf
// refers to a position outside _nodeInfoVec (logged as fatal), or no word
// matches.
const TrieNodeInfo* Trie::findPrefix(const string& str)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    Unicode uintVec;

    if(!TransCode::decode(str, uintVec))
    {
        LogError("TransCode::decode failed.");
        return NULL;
    }

    //find
    TrieNode* p = _root;
    const TrieNodeInfo * res = NULL;
    uint i = 0;
    while(true)
    {
        // Remember the deepest leaf seen so far.
        if(p->isLeaf)
        {
            const uint pos = p->nodeInfoVecPos;
            if(pos >= _nodeInfoVec.size())
            {
                LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
                return NULL;
            }
            res = &(_nodeInfoVec[pos]);
        }

        // Stop once the input is exhausted; the leaf check above has already
        // covered the node reached by the last code point.
        if(i >= uintVec.size())
        {
            break;
        }

        const uint16_t chUni = uintVec[i];
        if(p->hmap.find(chUni) == p->hmap.end())
        {
            break;
        }
        p = p->hmap[chUni];
        i++;
    }
    return res;
}
// Tears the trie down: deletes the node tree starting at _root, clears
// _nodeInfoVec and resets the init flag.
//
// Returns false if the trie is not initialised or if _deleteNode fails
// (logged as fatal; in that case no state is reset). Returns true on success.
bool Trie::dispose()
{
    if(!_getInitFlag())
    {
        return false;
    }

    if(!_deleteNode(_root))
    {
        LogFatal("_deleteNode failed!");
        return false;
    }

    _root = NULL;
    _nodeInfoVec.clear();
    _setInitFlag(false);
    return true;
}
// Initialises both underlying segmenters.
//
// mpSegDict  - dictionary path passed to _mpSeg.init.
// hmmSegDict - dictionary/model path passed to _hmmSeg.init.
//
// Returns false (after logging) if this object is already initialised or if
// either segmenter fails to initialise; otherwise returns the result of
// _setInitFlag(true).
//
// If _hmmSeg fails after _mpSeg has already been initialised, _mpSeg is
// disposed again. This leaves the object fully uninitialised rather than half
// set up, so a later init() attempt starts from a clean state.
bool MixSegment::init(const char* const mpSegDict, const char* const hmmSegDict)
{
    if(_getInitFlag())
    {
        LogError("inited.");
        return false;
    }
    if(!_mpSeg.init(mpSegDict))
    {
        LogError("_mpSeg init");
        return false;
    }
    if(!_hmmSeg.init(hmmSegDict))
    {
        LogError("_hmmSeg init");
        // Roll back the first step so the two members stay consistent.
        _mpSeg.dispose();
        return false;
    }
    return _setInitFlag(true);
}
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(str.empty()) { return false; } Unicode sentence; if(!TransCode::decode(str, sentence)) { LogError("TransCode::decode failed."); return false; } return cut(sentence.begin(), sentence.end(), segWordInfos); }
// Initialises the segmenter: initialises _trie and loads the dictionary at
// _dictPath into it.
//
// Returns false (after logging) if this object is already initialised, if
// _trie.init() fails, or if _trie.loadDict() fails; otherwise returns the
// result of _setInitFlag(true).
//
// When loadDict fails the trie is disposed again. Without this the trie would
// stay initialised while this object reports "not initialised", and every
// later init() would be rejected by _trie.init() as already initialised.
virtual bool init()
{
    if(_getInitFlag())
    {
        LogError("already inited before now.");
        return false;
    }
    if(!_trie.init())
    {
        LogError("_trie.init failed.");
        return false;
    }
    LogInfo("_trie.loadDict(%s) start...", _dictPath.c_str());
    if(!_trie.loadDict(_dictPath.c_str()))
    {
        LogError("_trie.loadDict faield.");
        // Roll back the trie so a retry starts from a clean state.
        _trie.dispose();
        return false;
    }
    LogInfo("_trie.loadDict end.");
    return _setInitFlag(true);
}
// Exact-match lookup: returns the info of the dictionary word spelled by
// [begin, end), or NULL.
//
// NULL is returned when the trie is not initialised (logged as fatal), the
// range is empty or reversed, some code point has no matching edge, the node
// reached is not a leaf, or the leaf refers to a position outside
// _nodeInfoVec (logged as fatal).
const TrieNodeInfo* Trie::find(Unicode::const_iterator begin, Unicode::const_iterator end)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    if(begin >= end)
    {
        return NULL;
    }

    TrieNode* node = _root;
    Unicode::const_iterator cur = begin;
    while(cur != end)
    {
        const uint16_t ch = *cur;
        if(node->hmap.end() == node->hmap.find(ch))
        {
            return NULL;
        }
        node = node->hmap[ch];
        cur++;
    }

    if(!node->isLeaf)
    {
        return NULL;
    }

    const uint pos = node->nodeInfoVecPos;
    if(pos >= _nodeInfoVec.size())
    {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
    }
    return &(_nodeInfoVec[pos]);
}
bool Trie::init() { if(_getInitFlag()) { LogError("already initted!"); return false; } try { _root = new TrieNode; } catch(const bad_alloc& e) { return false; } if(NULL == _root) { return false; } _setInitFlag(true); return true; }
// Boolean conversion: yields the current value of the init flag.
operator bool() const
{
    return _getInitFlag();
}
bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(begin == end) { return false; } vector<TrieNodeInfo> infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } Unicode unico; vector<Unicode> hmmRes; string tmp; for(uint i= 0; i < infos.size(); i++) { TransCode::encode(infos[i].word,tmp); if(1 == infos[i].word.size()) { unico.push_back(infos[i].word[0]); } else { if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } unico.clear(); TransCode::encode(infos[i].word, tmp); res.push_back(tmp); } } if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } return true; }