virtual bool cut(const string& str, vector<vector<string> >& vres) const { vres.clear(); Unicode unicode; unicode.reserve(str.size()); TransCode::decode(str, unicode); Unicode::const_iterator left = unicode.begin(); Unicode::const_iterator right; for(right = unicode.begin(); right != unicode.end(); right++) { if(isIn(specialSymbols_, *right)) { if(left != right) { cut(left, right, vres); } for (vector<vector<string> >::iterator itr = vres.begin(); itr != vres.end(); ++itr) { itr->resize(itr->size() + 1); TransCode::encode(right, right + 1, itr->back()); } left = right + 1; } } if(left != right) { cut(left, right, vres); } return true; }
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { vector<Unicode> words; words.reserve(end - begin); if(!_mpSeg.cut(begin, end, words)) { Rcout<<"mpSeg cutDAG failed."<<std::endl; return false; } vector<Unicode> hmmRes; hmmRes.reserve(end - begin); Unicode piece; piece.reserve(end - begin); for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } // if mp get a single one and it is not in userdict, collect it in sequence j = i; while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; } // cut the sequence with hmm if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { Rcout<<"_hmmSeg cut failed."<<std::endl; return false; } //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } //clear tmp vars piece.clear(); hmmRes.clear(); //let i jump over this piece i = j - 1; } return true; }
inline bool DecodeRunesInString(const char* s, size_t len, Unicode& unicode) { unicode.clear(); RuneStrArray runes; if (!DecodeRunesInString(s, len, runes)) { return false; } unicode.reserve(runes.size()); for (size_t i = 0; i < runes.size(); i++) { unicode.push_back(runes[i].rune); } return true; }
// compiler is expected to optimized this function to avoid return value copy inline Unicode Decode(const string& str) { Unicode unicode; unicode.reserve(str.size()); Decode(str, unicode); return unicode; }