// Exercises a DictTrie built from DICT_FILE: checks the minimum weight,
// a single-word lookup, and the DAG produced for a longer phrase.
TEST(DictTrieTest, Test1) {
  string s1, s2;
  DictTrie trie(DICT_FILE);
  // One-sided bound: holds whenever the minimum weight is below about -15.6469.
  ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);

  string word("来到");
  Unicode uni;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  DictUnit nodeInfo;
  nodeInfo.word = uni;
  nodeInfo.tag = "v";
  nodeInfo.weight = -8.87033;
  // NOTE(review): s1 is written again below before it is ever compared;
  // confirm whether this serialization is still needed.
  s1 << nodeInfo;

  // Fail the test instead of crashing the whole binary when the word is
  // missing from the dictionary: the lookup result is dereferenced next.
  const DictUnit* found = trie.Find(uni.begin(), uni.end());
  ASSERT_TRUE(found != NULL);
  s2 << *found;
  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);

  // Look up every dictionary prefix of the phrase individually; each entry is
  // keyed by the index of the prefix's last character.
  word = "清华大学";
  LocalVector<pair<size_t, const DictUnit*> > res;
  const char * words[] = {"清", "清华", "清华大学"};
  for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
    ASSERT_TRUE(TransCode::Decode(words[i], uni));
    res.push_back(make_pair(uni.size() - 1, trie.Find(uni.begin(), uni.end())));
  }

  // The DAG for the whole phrase has one node per character, and the first
  // node's successors must serialize identically to the individual lookups.
  vector<struct Dag> dags;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  trie.Find(uni.begin(), uni.end(), dags);
  ASSERT_EQ(dags.size(), uni.size());
  ASSERT_NE(dags.size(), 0u);
  s1 << res;
  s2 << dags[0].nexts;
  ASSERT_EQ(s1, s2);
}
bool _loadEmitProb(const string &line, EmitProbMap &mp) { if (line.empty()) { return false; } vector<string> tmp, tmp2; Unicode unicode; split(line, tmp, ","); for (size_t i = 0; i < tmp.size(); i++) { split(tmp[i], tmp2, ":"); if (2 != tmp2.size()) { Rcout << "_emitProb illegal." << std::endl; return false; } if (!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) { Rcout << "TransCode failed." << std::endl; return false; } mp[unicode[0]] = atof(tmp2[1].c_str()); } return true; }
// Reports whether str decodes to exactly one Unicode element.
//
// Returns true only when TransCode::Decode succeeds and produces a single
// element; returns false when decoding fails or yields any other count.
bool IsSingleWord(const string& str) const {
  Unicode unicode;
  // The decode status used to be ignored, so a failed decode that happened to
  // leave one element behind was reported as a single word.
  if (!TransCode::Decode(str, unicode)) {
    return false;
  }
  return unicode.size() == 1;
}
bool Trie::find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const { if(!_getInitFlag()) { LogFatal("trie not initted!"); return false; } TrieNode* p = _root; //for(Unicode::const_iterator it = begin; it != end; it++) for(uint i = 0; i < unico.size(); i++) { if(p->hmap.find(unico[i]) == p-> hmap.end()) { break; } p = p->hmap[unico[i]]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { res.push_back(make_pair(i, &_nodeInfoVec[pos])); } else { LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); return false; } } } return !res.empty(); }
// Returns true when every element of s is below 0x80; an empty sequence
// therefore yields true.
bool IsAllAscii(const Unicode& s) const {
  const size_t length = s.size();
  size_t pos = 0;
  // Advance past the leading run of elements below 0x80.
  while (pos < length && s[pos] < 0x80) {
    ++pos;
  }
  // The scan reached the end only if no element was 0x80 or above.
  return pos == length;
}
// Finds the info of the longest entry in the trie that is a prefix of str.
//
// str is decoded with TransCode::decode and the trie is walked one element at
// a time, remembering the most recent leaf seen. The walk stops at the first
// element without a child in the current node.
//
// Returns NULL when the trie is not initialised, decoding fails, the decoded
// input is empty, a leaf refers to a slot outside _nodeInfoVec, or no leaf is
// reached; otherwise returns a pointer into _nodeInfoVec.
const TrieNodeInfo* Trie::findPrefix(const string& str)const
{
    if(!_getInitFlag())
    {
        LogFatal("trie not initted!");
        return NULL;
    }
    Unicode uintVec;
    if(!TransCode::decode(str, uintVec))
    {
        LogError("TransCode::decode failed.");
        return NULL;
    }
    // Nothing to walk: keep returning NULL for empty input.
    if(uintVec.size() == 0)
    {
        return NULL;
    }

    TrieNode* p = _root;
    const TrieNodeInfo* res = NULL;
    // The loop runs one extra round after the last element so that the node
    // reached by the final step is also checked for being a leaf. The leaf
    // check used to run only before each step, so an entry matching the whole
    // input was never reported (unlike Trie::find, which records a match
    // right after stepping).
    for(uint i = 0; ; i++)
    {
        if(p->isLeaf)
        {
            uint pos = p->nodeInfoVecPos;
            if(pos >= _nodeInfoVec.size())
            {
                LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
                return NULL;
            }
            res = &(_nodeInfoVec[pos]);
        }
        if(i >= uintVec.size())
        {
            break;
        }
        uint16_t chUni = uintVec[i];
        if(p->hmap.find(chUni) == p->hmap.end())
        {
            break;
        }
        p = p->hmap[chUni];
    }
    return res;
}
bool LoadEmitProb(const std::string& line, EmitProbMap& mp) { if (line.empty()) { return false; } std::vector<std::string> tmp, tmp2; Unicode unicode; limonp::Split(line, tmp, ","); for (size_t i = 0; i < tmp.size(); i++) { limonp::Split(tmp[i], tmp2, ":"); if (2 != tmp2.size()) { // limonp::LOG(ERROR) << "emitProb illegal."; return false; } if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) { // limonp::LOG(ERROR) << "TransCode failed."; return false; } mp[unicode[0]] = atof(tmp2[1].c_str()); } return true; }