bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!_segment.cut(src, cutRes)) { Rcout<<"_mixSegment cut failed"<<std::endl; return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { Rcout<<"decode failed."<<std::endl; return false; } tmp = _dictTrie->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, _specialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
bool tag(const string& src, vector<pair<string, string> >& res) { assert(_getInitFlag()); vector<string> cutRes; if (!_segment.cut(src, cutRes)) { LogError("_mixSegment cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; return !res.empty(); }
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const { vector<string> words; if(!_segment.cut(str, words)) { LogError("segment cut(%s) failed.", str.c_str()); return false; } // filtering single word. for(vector<string>::iterator iter = words.begin(); iter != words.end(); ) { if(_isSingleWord(*iter)) { iter = words.erase(iter); } else { iter++; } } map<string, double> wordmap; for(size_t i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { if(_stopWords.end() != _stopWords.find(itr->first)) { wordmap.erase(itr++); continue; } unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; } else { itr->second *= _idfAverage; } itr ++; } keywords.clear(); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = MIN(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); keywords.resize(topN); return true; }
int cut_c(const char* text, char **&result) { vector<string> words; segment.cut(text, words); int word_count = words.size(); result = (char**)(malloc(sizeof(char*)*word_count)); int i = 0; for (vector<string>::const_iterator j = words.begin(); j != words.end(); j++) { result[i++] = strdup((*j).c_str()); } return word_count; }
/**
 * Segments [begin, end) with `_mixSeg`, then re-segments every resulting word
 * longer than `_maxWordLen` with `_fullSeg`.
 *
 * @param begin  start of the Unicode range to segment.
 * @param end    end of the Unicode range; the range must be non-empty.
 * @param res    output; segments are appended, existing content is kept.
 * @return false if the range is empty or `_mixSeg` fails, true otherwise.
 *         A `_fullSeg` failure on an individual over-long word is logged and
 *         that word contributes nothing to `res`; the call still returns true.
 */
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
    assert(_getInitFlag());
    if (begin >= end)
    {
        LogError("begin >= end");
        return false;
    }

    // use mix cut first
    vector<Unicode> mixRes;
    if (!_mixSeg.cut(begin, end, mixRes))
    {
        LogError("_mixSeg cut failed.");
        return false;
    }

    vector<Unicode> fullRes;
    for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); ++mixResItr)
    {
        if (mixResItr->size() <= _maxWordLen)
        {
            // short enough: just use the mix result
            res.push_back(*mixResItr);
            continue;
        }

        // Too long: cut with _fullSeg. Reset the scratch buffer before every
        // call so that partial output left behind by an earlier failed cut
        // can never be appended to a later word's results.
        fullRes.clear();
        if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
        {
            res.insert(res.end(), fullRes.begin(), fullRes.end());
        }
        else
        {
            LogError("_fullSeg cut failed.");
        }
    }
    return true;
}