// Segments the code-point range [begin, end) and hands the result to `res`
// through _cut().
//
// Steps: build one SegmentChar per input position, fill each position's DAG
// from the dictionary trie, run _calcDP() over the positions, then let _cut()
// produce the output.
//
// Returns false when the range is empty or when _cut() fails (the latter is
// also logged); returns true otherwise. Asserts that _getInitFlag() holds.
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
  if (end == begin) {
    return false;
  }
  assert(_getInitFlag());

  vector<SegmentChar> segmentChars(end - begin);

  // calc DAG
  for (size_t i = 0; i < segmentChars.size(); i++) {
    segmentChars[i].uniCh = *(begin + i);
    segmentChars[i].dag.clear();
    _dictTrie.find(begin + i, end, segmentChars[i].dag, i);
    // Make sure position i has an entry keyed by itself. The pair is built
    // directly rather than via make_pair<K, V>(i, NULL): with explicit
    // template arguments make_pair takes rvalue references in C++11 and
    // later, so the lvalue `i` would not bind and the call would not compile.
    // NOTE(review): assuming DagType is map-like, insert() keeps any entry
    // find() already stored under key i.
    segmentChars[i].dag.insert(
        std::pair<DagType::key_type, DagType::mapped_type>(
            i, static_cast<DagType::mapped_type>(NULL)));
  }

  _calcDP(segmentChars);

  if (!_cut(segmentChars, res)) {
    LogError("_cut failed.");
    return false;
  }
  return true;
}
bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!_segment.cut(src, cutRes)) { Rcout<<"_mixSegment cut failed"<<std::endl; return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { Rcout<<"decode failed."<<std::endl; return false; } tmp = _dictTrie->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, _specialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
bool tag(const string& src, vector<pair<string, string> >& res) { assert(_getInitFlag()); vector<string> cutRes; if (!_segment.cut(src, cutRes)) { LogError("_mixSegment cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; return !res.empty(); }
// Exercises DictTrie on the dictionary loaded from DICT_FILE:
//   1. the minimum weight satisfies getMinWeight() + 15.6479 < 0.001;
//   2. an exact lookup of "来到" serializes to the expected text;
//   3. the prefix search over "清华大学" yields the same (end index, unit)
//      pairs as exact lookups of "清", "清华" and "清华大学".
TEST(DictTrieTest, Test1) {
  string s1, s2;
  DictTrie trie;
  trie.init(DICT_FILE);
  ASSERT_LT(trie.getMinWeight() + 15.6479, 0.001);

  // Exact lookup of a single word.
  string word("来到");
  Unicode uni;
  ASSERT_TRUE(TransCode::decode(word, uni));
  DictUnit nodeInfo;
  nodeInfo.word = uni;
  nodeInfo.tag = "v";
  nodeInfo.weight = -8.87033;
  s1 << nodeInfo;
  // Check the lookup result before dereferencing it, so a missing entry fails
  // the test instead of crashing the test binary.
  const DictUnit* found = trie.find(uni.begin(), uni.end());
  ASSERT_TRUE(found != NULL);
  s2 << (*found);
  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", s2);

  // Expected prefix matches, built from exact lookups and keyed by the index
  // of each prefix's last code point.
  word = "清华大学";
  LocalVector<pair<size_t, const DictUnit*> > res;
  LocalVector<pair<size_t, const DictUnit*> > res2;
  const char * words[] = {"清", "清华", "清华大学"};
  for (size_t i = 0; i < sizeof(words)/sizeof(words[0]); i++) {
    ASSERT_TRUE(TransCode::decode(words[i], uni));
    res.push_back(make_pair(uni.size() - 1, trie.find(uni.begin(), uni.end())));
  }

  // Actual prefix matches from one prefix search starting at offset 0.
  ASSERT_TRUE(TransCode::decode(word, uni));
  ASSERT_TRUE(trie.find(uni.begin(), uni.end(), res2, 0));

  // Compare expected (res) against actual (res2). Serializing `res` into both
  // strings would make this assertion pass unconditionally.
  s1 << res;
  s2 << res2;
  ASSERT_EQ(s1, s2);
}
// Appends one SegmentChar to `SegmentChars` for every code point in
// [begin, end).
//
// Each entry holds the code point and the DAG that _dictTrie.find() fills for
// the sub-range starting at that position. If the DAG has no entry keyed by
// the position's own offset, one mapped to NULL is added. `SegmentChars` is
// not cleared first. Always returns true.
bool _calcDAG(Unicode::const_iterator begin, Unicode::const_iterator end, vector<SegmentChar>& SegmentChars) const {
  // One scratch element, reset and copied into the output on every iteration.
  SegmentChar current;
  for (Unicode::const_iterator pos = begin; pos != end; ++pos) {
    size_t index = pos - begin;
    current.uniCh = *pos;
    current.dag.clear();
    _dictTrie.find(pos, end, current.dag, index);
    if (!isIn(current.dag, index)) {
      current.dag[index] = NULL;
    }
    SegmentChars.push_back(current);
  }
  return true;
}