bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!segment_.cut(src, cutRes)) { LogError("mixSegment_ cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; const DictTrie * dict = segment_.getDictTrie(); assert(dict != NULL); for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = dict->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, specialRule_(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
// Returns true only when `str` decodes successfully to exactly one code unit
// of `Unicode`. A string that fails to decode is never a single word; the
// previous version ignored the Decode result and judged whatever was left in
// the buffer.
bool IsSingleWord(const string& str) const {
  Unicode unicode;
  if (!TransCode::Decode(str, unicode)) {
    return false;
  }
  return unicode.size() == 1;
}
bool _loadEmitProb(const string &line, EmitProbMap &mp) { if (line.empty()) { return false; } vector<string> tmp, tmp2; Unicode unicode; split(line, tmp, ","); for (size_t i = 0; i < tmp.size(); i++) { split(tmp[i], tmp2, ":"); if (2 != tmp2.size()) { Rcout << "_emitProb illegal." << std::endl; return false; } if (!TransCode::decode(tmp2[0], unicode) || unicode.size() != 1) { Rcout << "TransCode failed." << std::endl; return false; } mp[unicode[0]] = atof(tmp2[1].c_str()); } return true; }
bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!_segment.cut(src, cutRes)) { Rcout<<"_mixSegment cut failed"<<std::endl; return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { Rcout<<"decode failed."<<std::endl; return false; } tmp = _dictTrie->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, _specialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
bool tag(const string& src, vector<pair<string, string> >& res) { assert(_getInitFlag()); vector<string> cutRes; if (!_segment.cut(src, cutRes)) { LogError("_mixSegment cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; return !res.empty(); }
virtual bool cut(const string& str, vector<vector<string> >& vres) const { vres.clear(); Unicode unicode; unicode.reserve(str.size()); TransCode::decode(str, unicode); Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator right; for(right = unicode.begin(); right != unicode.end(); right++) { if(isIn(specialSymbols_, *right)) { if(left != right) { cut(left, right, vres); } for (vector<vector<string> >::iterator itr = vres.begin(); itr != vres.end(); ++itr) { itr->resize(itr->size() + 1); TransCode::encode(right, right + 1, itr->back()); } left = right + 1; } } if(left != right) { cut(left, right, vres); } return true; }
// Smoke test: looking up an ASCII word through the range overload of find()
// on a trie built from the main and user dictionaries must decode and run.
// Nothing about the contents of the result is asserted.
TEST(DictTrieTest, automation) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");

  const string input = "abcderf";
  Unicode runes;
  ASSERT_TRUE(TransCode::decode(input, runes));

  vector<struct SegmentChar> matches;
  trie.find(runes.begin(), runes.end(), matches);
}
// Checks basic DictTrie behaviour on the main dictionary:
//  - the minimum weight satisfies GetMinWeight() + 15.6479 < 0.001,
//  - "来到" is found and prints as the expected word/tag/weight string,
//  - the DAG built for "清华大学" has one entry per character, and the
//    entry for the first character prints the same as the list of
//    (last index, unit) pairs found for "清", "清华" and "清华大学".
TEST(DictTrieTest, Test1) {
  string expected, actual;
  DictTrie trie(DICT_FILE);
  ASSERT_LT(trie.GetMinWeight() + 15.6479, 0.001);

  string word("来到");
  Unicode runes;
  ASSERT_TRUE(TransCode::Decode(word, runes));

  DictUnit unit;
  unit.word = runes;
  unit.tag = "v";
  unit.weight = -8.87033;
  expected << unit;
  actual << (*trie.Find(runes.begin(), runes.end()));
  EXPECT_EQ("[\"26469\", \"21040\"] v -8.870", actual);

  word = "清华大学";
  const char * prefixes[] = {"清", "清华", "清华大学"};
  const size_t prefixCount = sizeof(prefixes)/sizeof(prefixes[0]);
  LocalVector<pair<size_t, const DictUnit*> > prefixHits;
  for (size_t k = 0; k != prefixCount; ++k) {
    ASSERT_TRUE(TransCode::Decode(prefixes[k], runes));
    // Key each hit by the index of its last character.
    prefixHits.push_back(make_pair(runes.size() - 1, trie.Find(runes.begin(), runes.end())));
  }

  vector<struct Dag> dags;
  ASSERT_TRUE(TransCode::Decode(word, runes));
  trie.Find(runes.begin(), runes.end(), dags);
  ASSERT_EQ(dags.size(), runes.size());
  ASSERT_NE(dags.size(), 0u);

  expected << prefixHits;
  actual << dags[0].nexts;
  ASSERT_EQ(expected, actual);
}
// With DictTrie::WordWeightMax selected for user-dictionary words, the entry
// for "云计算" must exist and print with the expected code points and weight.
TEST(DictTrieTest, UserDictWithMaxWeight) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8", DictTrie::WordWeightMax);

  const string word = "云计算";
  Unicode runes;
  ASSERT_TRUE(TransCode::Decode(word, runes));

  const DictUnit * found = trie.Find(runes.begin(), runes.end());
  ASSERT_TRUE(found);

  string printed;
  printed << *found;
  ASSERT_EQ("[\"20113\", \"35745\", \"31639\"] -2.975", printed);
}
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { unicode.clear(); RuneStrArray runes; if (!DecodeRunesInString(s, len, runes)) { return false; } unicode.reserve(runes.size()); for (size_t i = 0; i < runes.size(); i++) { unicode.push_back(runes[i].rune); } return true; }
// Mixed segmentation of [begin, end): first cut with _mpSeg, then re-cut every
// maximal run of consecutive single-character words that are not user-dict
// single words with _hmmSeg. Results are appended to `res` in order.
// Returns false (after logging) if either segmenter reports failure.
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
  assert(_getInitFlag());

  vector<Unicode> mpWords;
  if (!_mpSeg.cut(begin, end, mpWords)) {
    LogError("mpSeg cutDAG failed.");
    return false;
  }

  vector<Unicode> hmmWords;
  Unicode run;
  size_t pos = 0;
  while (pos < mpWords.size()) {
    // Multi-character words and user-dict single characters are kept as-is.
    if (mpWords[pos].size() != 1 || _mpSeg.isUserDictSingleChineseWord(mpWords[pos][0])) {
      res.push_back(mpWords[pos]);
      ++pos;
      continue;
    }

    // Gather the run of leftover single characters starting at pos.
    size_t stop = pos;
    while (stop < mpWords.size()
           && mpWords[stop].size() == 1
           && !_mpSeg.isUserDictSingleChineseWord(mpWords[stop][0])) {
      run.push_back(mpWords[stop][0]);
      ++stop;
    }

    if (!_hmmSeg.cut(run.begin(), run.end(), hmmWords)) {
      LogError("_hmmSeg cut failed.");
      return false;
    }
    for (size_t k = 0; k < hmmWords.size(); ++k) {
      res.push_back(hmmWords[k]);
    }

    // Reset the scratch buffers and continue right after the run.
    run.clear();
    hmmWords.clear();
    pos = stop;
  }
  return true;
}
bool Trie::find(const Unicode& unico, vector<pair<uint, const TrieNodeInfo*> >& res)const { if(!_getInitFlag()) { LogFatal("trie not initted!"); return false; } TrieNode* p = _root; //for(Unicode::const_iterator it = begin; it != end; it++) for(uint i = 0; i < unico.size(); i++) { if(p->hmap.find(unico[i]) == p-> hmap.end()) { break; } p = p->hmap[unico[i]]; if(p->isLeaf) { uint pos = p->nodeInfoVecPos; if(pos < _nodeInfoVec.size()) { res.push_back(make_pair(i, &_nodeInfoVec[pos])); } else { LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range"); return false; } } } return !res.empty(); }
bool cut(const string& sentence, vector<string>& words, size_t max_word_len) const { Unicode unicode; if (!TransCode::decode(sentence, unicode)) { return false; } vector<Unicode> unicodeWords; cut(unicode.begin(), unicode.end(), unicodeWords, max_word_len); words.resize(unicodeWords.size()); for (size_t i = 0; i < words.size(); i++) { TransCode::encode(unicodeWords[i], words[i]); } return true; }
// Returns true when every element of `s` is below 0x80 (vacuously true for an
// empty sequence).
bool IsAllAscii(const Unicode& s) const {
  size_t idx = 0;
  // Advance past the leading run of values < 0x80.
  while (idx < s.size() && s[idx] < 0x80) {
    ++idx;
  }
  return idx == s.size();
}
// Returns the node info of the longest dictionary word that is a prefix of
// `str`, or NULL when there is none, the trie is not initialised, `str`
// cannot be decoded, or a leaf's info position is out of range.
//
// Fix: the leaf check is now also performed on the node reached after the
// last character. Previously the loop only inspected a node before consuming
// the next character, so a word equal to the whole input was never reported.
const TrieNodeInfo* Trie::findPrefix(const string& str)const {
  if(!_getInitFlag()) {
    LogFatal("trie not initted!");
    return NULL;
  }
  Unicode uintVec;
  if(!TransCode::decode(str, uintVec)) {
    LogError("TransCode::decode failed.");
    return NULL;
  }

  TrieNode* p = _root;
  const TrieNodeInfo * res = NULL;
  for(uint i = 0; ; i++) {
    // Remember the deepest leaf seen so far (including the final node).
    if(p->isLeaf) {
      const uint pos = p->nodeInfoVecPos;
      if(pos >= _nodeInfoVec.size()) {
        LogFatal("node's nodeInfoVecPos is out of _nodeInfoVec's range");
        return NULL;
      }
      res = &(_nodeInfoVec[pos]);
    }
    // Whole input consumed: nothing left to descend on.
    if(i >= uintVec.size()) {
      break;
    }
    const uint16_t chUni = uintVec[i];
    if(p->hmap.find(chUni) == p->hmap.end()) {
      break;
    }
    p = p->hmap[chUni];
  }
  return res;
}
bool cut(const string& str, vector<TrieNodeInfo>& segWordInfos)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(str.empty()) { return false; } Unicode sentence; if(!TransCode::decode(str, sentence)) { LogError("TransCode::decode failed."); return false; } return cut(sentence.begin(), sentence.end(), segWordInfos); }
bool LoadEmitProb(const std::string& line, EmitProbMap& mp) { if (line.empty()) { return false; } std::vector<std::string> tmp, tmp2; Unicode unicode; limonp::Split(line, tmp, ","); for (size_t i = 0; i < tmp.size(); i++) { limonp::Split(tmp[i], tmp2, ":"); if (2 != tmp2.size()) { // limonp::LOG(ERROR) << "emitProb illegal."; return false; } if (!TransCode::Decode(tmp2[0], unicode) || unicode.size() != 1) { // limonp::LOG(ERROR) << "TransCode failed."; return false; } mp[unicode[0]] = atof(tmp2[1].c_str()); } return true; }
bool Tag(const string& src, vector<pair<string, string> >& res) const { vector<string> CutRes; segment_.Cut(src, CutRes); const DictUnit *tmp = NULL; Unicode unico; const DictTrie * dict = segment_.GetDictTrie(); assert(dict != NULL); for (vector<string>::iterator itr = CutRes.begin(); itr != CutRes.end(); ++itr) { if (!TransCode::Decode(*itr, unico)) { LOG(ERROR) << "Decode failed."; return false; } tmp = dict->Find(unico.begin(), unico.end()); if (tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, SpecialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
// Encodes `object` via u8_encode and wraps the produced buffer and length in
// a Bytes value. Throws EncodeError (carrying the state, the offset reported
// by u8_encode, and "UTF-8") when u8_encode does not return
// UNICODE_STATE_SUCCESS.
inline Bytes encode(const Unicode& object) const {
  size_t sourceLength = object.length();
  const unicode* source = static_cast<const unicode*>(object);

  // Output parameters filled in by u8_encode.
  bytechar* encoded = NULL;
  size_t encodedLength = 0;
  size_t position = 0;

  int status = u8_encode(source, sourceLength, encoded, encodedLength, position);
  if (status != UNICODE_STATE_SUCCESS) {
    throw EncodeError(status, position, "UTF-8");
  }
  return Bytes(encoded, encodedLength);
}
virtual bool cut(const string& str, vector<string>& res)const { assert(_getInitFlag()); Unicode unicode; TransCode::decode(str, unicode); res.clear(); Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator right = unicode.begin(); string oneword; while(right != unicode.end()) { if(isIn(_specialSymbols, *right)) { if(left != right) { cut(left, right, res); } TransCode::encode(right, right + 1, oneword); res.push_back(oneword); right ++; left = right; } else { right ++; } } if(left != right) { cut(left, right, res); } return true; }
void InsertNode(const Unicode& key, const DictUnit* ptValue) { if (key.begin() == key.end()) { return; } TrieNode::NextMap::const_iterator kmIter; Unicode::const_iterator citer= key.begin(); TrieNode *ptNode = _base + (*(citer++)); for (; citer != key.end(); citer++) { if (NULL == ptNode->next) { ptNode->next = new TrieNode::NextMap; } kmIter = ptNode->next->find(*citer); if (ptNode->next->end() == kmIter) { TrieNode *nextNode = new TrieNode; (*(ptNode->next))[*citer] = nextNode; ptNode = nextNode; } else { ptNode = kmIter->second; } } ptNode->ptValue = ptValue; }
// Encodes `object` one code point at a time through uctobyte() into a newly
// allocated byte buffer of the same length, and returns it wrapped in Bytes.
// A uctobyte() result of 0x110000 is treated as "not encodable": the output
// buffer is released and EncodeError(UNICODE_STATE_ILLEGAL, offset, *this) is
// thrown, where offset is the index of the offending code point.
//
// Fix: the error path used to call delete[] on `decptr` — the caller's input
// data, which this function did not allocate — while leaking `encptr`.
// NOTE(review): Bytes(encptr, len) is assumed to take ownership of the
// buffer, as in the original code — confirm against the Bytes constructor.
inline Bytes encode(const Unicode& object) const {
  const size_t len = object.length();
  const unicode* decptr = object;
  bytechar* encptr = new bytechar[len];

  for (size_t offset = 0; offset < len; ++offset) {
    const unicode code = this->uctobyte(decptr[offset]);
    if (code == 0x110000) {
      // Free the buffer we own; the input buffer belongs to `object`.
      delete[] encptr;
      throw EncodeError(UNICODE_STATE_ILLEGAL, offset, *this);
    }
    encptr[offset] = static_cast<bytechar>(code);
  }
  return Bytes(encptr, len);
}
// Mixed segmentation of [begin, end): first cut with mpSeg_, then re-cut every
// maximal run of consecutive single-character words that are not user-dict
// single words with hmmSeg_. Results are appended to `res` in order.
virtual void cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const {
  vector<Unicode> mpWords;
  mpWords.reserve(end - begin);
  mpSeg_.cut(begin, end, mpWords);

  vector<Unicode> hmmWords;
  hmmWords.reserve(end - begin);
  Unicode run;
  run.reserve(end - begin);

  size_t pos = 0;
  while (pos < mpWords.size()) {
    // Multi-character words and user-dict single characters are kept as-is.
    if (mpWords[pos].size() != 1 || mpSeg_.isUserDictSingleChineseWord(mpWords[pos][0])) {
      res.push_back(mpWords[pos]);
      ++pos;
      continue;
    }

    // Gather the run of leftover single characters starting at pos.
    size_t stop = pos;
    while (stop < mpWords.size()
           && mpWords[stop].size() == 1
           && !mpSeg_.isUserDictSingleChineseWord(mpWords[stop][0])) {
      run.push_back(mpWords[stop][0]);
      ++stop;
    }

    hmmSeg_.cut(run.begin(), run.end(), hmmWords);
    for (size_t k = 0; k < hmmWords.size(); ++k) {
      res.push_back(hmmWords[k]);
    }

    // Reset the scratch buffers and continue right after the run.
    run.clear();
    hmmWords.clear();
    pos = stop;
  }
}
inline void Encode(const Unicode& uni, string& res) { Encode(uni.begin(), uni.end(), res); }
bool MixSegment::cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<string>& res)const { if(!_getInitFlag()) { LogError("not inited."); return false; } if(begin == end) { return false; } vector<TrieNodeInfo> infos; if(!_mpSeg.cut(begin, end, infos)) { LogError("mpSeg cutDAG failed."); return false; } Unicode unico; vector<Unicode> hmmRes; string tmp; for(uint i= 0; i < infos.size(); i++) { TransCode::encode(infos[i].word,tmp); if(1 == infos[i].word.size()) { unico.push_back(infos[i].word[0]); } else { if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } unico.clear(); TransCode::encode(infos[i].word, tmp); res.push_back(tmp); } } if(!unico.empty()) { hmmRes.clear(); if(!_hmmSeg.cut(unico.begin(), unico.end(), hmmRes)) { LogError("_hmmSeg cut failed."); return false; } for(uint j = 0; j < hmmRes.size(); j++) { TransCode::encode(hmmRes[j], tmp); res.push_back(tmp); } } return true; }
// For several inputs, builds the DAG with DictTrie::Find and checks that it
// has one entry per expected position and that each entry's `nexts` list has
// the expected size. The last two cases pass an explicit fourth argument
// (3 and 4) to Find for the same input and expect different sizes for the
// first position.
TEST(DictTrieTest, Dag) {
  DictTrie trie(DICT_FILE, "../test/testdata/userdict.utf8");

  {
    const string input = "清华大学";
    Unicode runes;
    ASSERT_TRUE(TransCode::Decode(input, runes));
    vector<struct Dag> dags;
    trie.Find(runes.begin(), runes.end(), dags);

    size_t expected[] = {3, 2, 2, 1};
    const size_t expectedCount = sizeof(expected)/sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expectedCount);
    for (size_t k = 0; k < dags.size(); ++k) {
      ASSERT_EQ(dags[k].nexts.size(), expected[k]);
    }
  }

  {
    const string input = "北京邮电大学";
    Unicode runes;
    ASSERT_TRUE(TransCode::Decode(input, runes));
    vector<struct Dag> dags;
    trie.Find(runes.begin(), runes.end(), dags);

    size_t expected[] = {3, 1, 2, 2, 2, 1};
    const size_t expectedCount = sizeof(expected)/sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expectedCount);
    for (size_t k = 0; k < dags.size(); ++k) {
      ASSERT_EQ(dags[k].nexts.size(), expected[k]);
    }
  }

  {
    const string input = "长江大桥";
    Unicode runes;
    ASSERT_TRUE(TransCode::Decode(input, runes));
    vector<struct Dag> dags;
    trie.Find(runes.begin(), runes.end(), dags);

    size_t expected[] = {3, 1, 2, 1};
    const size_t expectedCount = sizeof(expected)/sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expectedCount);
    for (size_t k = 0; k < dags.size(); ++k) {
      ASSERT_EQ(dags[k].nexts.size(), expected[k]);
    }
  }

  {
    // Same input, fourth argument 3.
    const string input = "长江大桥";
    Unicode runes;
    ASSERT_TRUE(TransCode::Decode(input, runes));
    vector<struct Dag> dags;
    trie.Find(runes.begin(), runes.end(), dags, 3);

    size_t expected[] = {2, 1, 2, 1};
    const size_t expectedCount = sizeof(expected)/sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expectedCount);
    for (size_t k = 0; k < dags.size(); ++k) {
      ASSERT_EQ(dags[k].nexts.size(), expected[k]);
    }
  }

  {
    // Same input, fourth argument 4.
    const string input = "长江大桥";
    Unicode runes;
    ASSERT_TRUE(TransCode::Decode(input, runes));
    vector<struct Dag> dags;
    trie.Find(runes.begin(), runes.end(), dags, 4);

    size_t expected[] = {3, 1, 2, 1};
    const size_t expectedCount = sizeof(expected)/sizeof(expected[0]);
    ASSERT_EQ(dags.size(), expectedCount);
    for (size_t k = 0; k < dags.size(); ++k) {
      ASSERT_EQ(dags[k].nexts.size(), expected[k]);
    }
  }
}
// Returns the encoded form of the whole of `unicode` by forwarding its full
// range to the iterator-range Encode overload.
inline string Encode(const Unicode& unicode) {
  Unicode::const_iterator first = unicode.begin();
  Unicode::const_iterator last = unicode.end();
  return Encode(first, last);
}
// compiler is expected to optimized this function to avoid return value copy inline Unicode Decode(const string& str) { Unicode unicode; unicode.reserve(str.size()); Decode(str, unicode); return unicode; }
inline bool encode(const Unicode& uni, string& res) { return encode(uni.begin(), uni.end(), res); }