// Checks a DictTrie built from DICT_FILE: its minimum weight, the printed form
// of a single looked-up word, and that the first DAG entry for "清华大学"
// matches the individual lookups of its prefixes.
TEST(DictTrieTest, Test1) {
  string expected;
  string actual;
  DictTrie trie(DICT_FILE);
  // One-sided bound: holds whenever GetMinWeight() is below -15.6469.
  ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);

  // Single word lookup, compared through the printed representation.
  string word("来到");
  Unicode uni;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  DictUnit nodeInfo;
  nodeInfo.word = uni;
  nodeInfo.tag = "v";
  nodeInfo.weight = -8.87033;
  expected << nodeInfo;
  actual << (*trie.Find(uni.begin(), uni.end()));
  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", actual);

  // Look up every prefix of the word on its own, keyed by its last index.
  word = "清华大学";
  LocalVector<pair<size_t, const DictUnit*> > prefixHits;
  const char * prefixes[] = {"清", "清华", "清华大学"};
  const size_t prefixCount = sizeof(prefixes) / sizeof(prefixes[0]);
  for (size_t idx = 0; idx < prefixCount; ++idx) {
    ASSERT_TRUE(TransCode::Decode(prefixes[idx], uni));
    const DictUnit* unit = trie.Find(uni.begin(), uni.end());
    prefixHits.push_back(make_pair(uni.size() - 1, unit));
  }

  // Build the DAG for the full word; it has one entry per decoded element.
  vector<struct Dag> dags;
  ASSERT_TRUE(TransCode::Decode(word, uni));
  trie.Find(uni.begin(), uni.end(), dags);
  ASSERT_EQ(dags.size(), uni.size());
  ASSERT_NE(dags.size(), 0u);

  // The first DAG entry must print the same as the individual prefix lookups.
  expected << prefixHits;
  actual << dags[0].nexts;
  ASSERT_EQ(expected, actual);
}
bool tag(const string& src, vector<pair<string, string> >& res) { assert(_getInitFlag()); vector<string> cutRes; if (!_segment.cut(src, cutRes)) { LogError("_mixSegment cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; return !res.empty(); }
bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!_segment.cut(src, cutRes)) { Rcout<<"_mixSegment cut failed"<<std::endl; return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { Rcout<<"decode failed."<<std::endl; return false; } tmp = _dictTrie->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, _specialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!segment_.cut(src, cutRes)) { LogError("mixSegment_ cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; const DictTrie * dict = segment_.getDictTrie(); assert(dict != NULL); for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = dict->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, specialRule_(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
virtual bool cut(const string& str, vector<vector<string> >& vres) const { vres.clear(); Unicode unicode; unicode.reserve(str.size()); TransCode::decode(str, unicode); Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator right; for(right = unicode.begin(); right != unicode.end(); right++) { if(isIn(specialSymbols_, *right)) { if(left != right) { cut(left, right, vres); } for (vector<vector<string> >::iterator itr = vres.begin(); itr != vres.end(); ++itr) { itr->resize(itr->size() + 1); TransCode::encode(right, right + 1, itr->back()); } left = right + 1; } } if(left != right) { cut(left, right, vres); } return true; }
// Smoke test: runs the multi-result find() over an ASCII string on a trie
// loaded with the user dictionary. Only the decode step is asserted; the
// lookup result itself is not inspected.
TEST(DictTrieTest, automation) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");

  const string input("abcderf");
  Unicode decoded;
  ASSERT_TRUE(TransCode::decode(input, decoded));

  vector<struct SegmentChar> matches;
  trie.find(decoded.begin(), decoded.end(), matches);
}
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { vector<Unicode> words; words.reserve(end - begin); if(!_mpSeg.cut(begin, end, words)) { Rcout<<"mpSeg cutDAG failed."<<std::endl; return false; } vector<Unicode> hmmRes; hmmRes.reserve(end - begin); Unicode piece; piece.reserve(end - begin); for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } // if mp get a single one and it is not in userdict, collect it in sequence j = i; while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; } // cut the sequence with hmm if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { Rcout<<"_hmmSeg cut failed."<<std::endl; return false; } //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } //clear tmp vars piece.clear(); hmmRes.clear(); //let i jump over this piece i = j - 1; } return true; }
// With DictTrie::WordWeightMax selected, looks up "云计算" in a trie loaded
// with the user dictionary and checks the printed form of the entry found.
TEST(DictTrieTest, UserDictWithMaxWeight) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);

  const string word("云计算");
  Unicode decoded;
  ASSERT_TRUE(TransCode::Decode(word, decoded));

  const DictUnit * found = trie.Find(decoded.begin(), decoded.end());
  ASSERT_TRUE(found != NULL);

  string printed;
  printed << *found;
  ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", printed);
}
void InsertNode(const Unicode& key, const DictUnit* ptValue) { if (key.begin() == key.end()) { return; } TrieNode::NextMap::const_iterator kmIter; Unicode::const_iterator citer= key.begin(); TrieNode *ptNode = _base + (*(citer++)); for (; citer != key.end(); citer++) { if (NULL == ptNode->next) { ptNode->next = new TrieNode::NextMap; } kmIter = ptNode->next->find(*citer); if (ptNode->next->end() == kmIter) { TrieNode *nextNode = new TrieNode; (*(ptNode->next))[*citer] = nextNode; ptNode = nextNode; } else { ptNode = kmIter->second; } } ptNode->ptValue = ptValue; }
bool cut(const string& sentence, vector<string>& words, size_t max_word_len) const { Unicode unicode; if (!TransCode::decode(sentence, unicode)) { return false; } vector<Unicode> unicodeWords; cut(unicode.begin(), unicode.end(), unicodeWords, max_word_len); words.resize(unicodeWords.size()); for (size_t i = 0; i < words.size(); i++) { TransCode::encode(unicodeWords[i], words[i]); } return true; }
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(str.empty()) { return false; } Unicode sentence; if(!TransCode::decode(str, sentence)) { LogError("TransCode::decode failed."); return false; } return cut(sentence.begin(), sentence.end(), segWordInfos); }
bool Tag(const string& src, vector<pair<string, string> >& res) const { vector<string> CutRes; segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; Unicode unico; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { if (!TransCode::Decode(*itr, unico)) { LOG(ERROR) << "Decode failed."; return false; } tmp = dict->Find(unico.begin(), unico.end()); if (tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, SpecialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
virtual bool cut(const string& str, vector<string>& res)const { assert(_getInitFlag()); Unicode unicode; TransCode::decode(str, unicode); res.clear(); Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator right = unicode.begin(); string oneword; while(right != unicode.end()) { if(isIn(_specialSymbols, *right)) { if(left != right) { cut(left, right, res); } TransCode::encode(right, right + 1, oneword); res.push_back(oneword); right ++; left = right; } else { right ++; } } if(left != right) { cut(left, right, res); } return true; }
inline bool encode(const Unicode& uni, string& res) { return encode(uni.begin(), uni.end(), res); }
inline void Encode(const Unicode& uni, string& res) { Encode(uni.begin(), uni.end(), res); }
// Builds the DAG for several words and checks, per position, how many `nexts`
// each entry has. The last two cases pass an explicit fourth argument to
// Find(); the others use the three-argument call.
TEST(DictTrieTest, Dag) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");

  // One row per scenario. `limited` selects the four-argument Find() call and
  // `limit` is the value passed to it; `count` is the number of meaningful
  // entries in `nexts_sizes`.
  struct DagCase {
    const char* word;
    bool limited;
    size_t limit;
    size_t count;
    size_t nexts_sizes[6];
  };
  const DagCase cases[] = {
    {"清华大学",     false, 0, 4, {3, 2, 2, 1}},
    {"北京邮电大学", false, 0, 6, {3, 1, 2, 2, 2, 1}},
    {"长江大桥",     false, 0, 4, {3, 1, 2, 1}},
    {"长江大桥",     true,  3, 4, {2, 1, 2, 1}},
    {"长江大桥",     true,  4, 4, {3, 1, 2, 1}}
  };
  const size_t caseCount = sizeof(cases) / sizeof(cases[0]);

  for (size_t c = 0; c < caseCount; c++) {
    const DagCase& tc = cases[c];

    string word = tc.word;
    Unicode unicode;
    ASSERT_TRUE(TransCode::Decode(word, unicode));

    vector<struct Dag> res;
    if (tc.limited) {
      trie.Find(unicode.begin(), unicode.end(), res, tc.limit);
    } else {
      trie.Find(unicode.begin(), unicode.end(), res);
    }

    ASSERT_EQ(res.size(), tc.count);
    for (size_t i = 0; i < res.size(); i++) {
      ASSERT_EQ(res[i].nexts.size(), tc.nexts_sizes[i]);
    }
  }
}
// Convenience overload: returns the encoding of all of `unicode` by
// forwarding the full range to the iterator-based Encode().
inline string Encode(const Unicode& unicode) {
  Unicode::const_iterator first = unicode.begin();
  Unicode::const_iterator last = unicode.end();
  return Encode(first, last);
}
bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(begin == end) { return false; } vector<TrieNodeInfo> infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } Unicode unico; vector<Unicode> hmmRes; string tmp; for(uint i= 0; i < infos.size(); i++) { TransCode::encode(infos[i].word,tmp); if(1 == infos[i].word.size()) { unico.push_back(infos[i].word[0]); } else { if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } unico.clear(); TransCode::encode(infos[i].word, tmp); res.push_back(tmp); } } if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } return true; }