int main(int argc, char ** argv) { if(argc < 2) { cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" <<"options:\n" <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" <<"example:\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n" <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n" <<endl; return EXIT_FAILURE; } ArgvContext arg(argc, argv); string dictPath = arg["--dictpath"]; string modelPath = arg["--modelpath"]; string algorithm = arg["--algorithm"]; if("cutHMM" == algorithm) { HMMSegment seg; if(!seg.init(modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else if("cutMix" == algorithm) { MixSegment seg; if(!seg.init(dictPath.c_str(), modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else { MPSegment seg; if(!seg.init(dictPath.c_str())) { cout<<"seg init failed."<<endl; return false; } cut(&seg, arg[1].c_str()); seg.dispose(); } return EXIT_SUCCESS; }
/**
 * Initialises the underlying segmenter and caches its dictionary trie.
 *
 * Both the segmenter initialisation and the retrieved trie pointer are
 * passed through LIMONP_CHECK. The macro is defined outside this view, so
 * its arguments are kept verbatim in case it reports the checked expression
 * textually.
 *
 * @param dictPath      path handed to `_segment.init` as the dictionary.
 * @param hmmFilePath   path handed to `_segment.init` as the HMM model.
 * @param userDictPath  optional user dictionary path; defaults to "".
 */
void init( const string& dictPath, const string& hmmFilePath, const string& userDictPath = "" )
{
    LIMONP_CHECK(_segment.init(dictPath, hmmFilePath, userDictPath));

    // The trie is owned by the segmenter side; only the handle is stored here.
    _dictTrie = _segment.getDictTrie();
    LIMONP_CHECK(_dictTrie);
}
bool tag(const string& src, vector<pair<string, string> >& res) const { vector<string> cutRes; if (!_segment.cut(src, cutRes)) { Rcout<<"_mixSegment cut failed"<<std::endl; return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { Rcout<<"decode failed."<<std::endl; return false; } tmp = _dictTrie->find(unico.begin(), unico.end()); if(tmp == NULL || tmp->tag.empty()) { res.push_back(make_pair(*itr, _specialRule(unico))); } else { res.push_back(make_pair(*itr, tmp->tag)); } } return !res.empty(); }
/**
 * Loads the IDF and stop-word dictionaries, then initialises the segmenter.
 *
 * The segmenter initialisation is guarded by LIMONP_CHECK. The macro is
 * defined outside this view, so its argument is kept verbatim in case it
 * reports the checked expression textually.
 *
 * @param dictPath      dictionary path forwarded to `_segment.init`.
 * @param hmmFilePath   HMM model path forwarded to `_segment.init`.
 * @param idfPath       path passed to `_loadIdfDict`.
 * @param stopWordPath  path passed to `_loadStopWordDict`.
 * @return always true when control reaches the end of the function.
 */
bool init(const string &dictPath, const string &hmmFilePath, const string &idfPath, const string &stopWordPath)
{
    // Auxiliary dictionaries are loaded before the segmenter itself.
    _loadIdfDict(idfPath);
    _loadStopWordDict(stopWordPath);

    LIMONP_CHECK(_segment.init(dictPath, hmmFilePath));

    return true;
}
bool tag(const string& src, vector<pair<string, string> >& res) { assert(_getInitFlag()); vector<string> cutRes; if (!_segment.cut(src, cutRes)) { LogError("_mixSegment cut failed"); return false; } const DictUnit *tmp = NULL; Unicode unico; for (vector<string>::iterator itr = cutRes.begin(); itr != cutRes.end(); ++itr) { if (!TransCode::decode(*itr, unico)) { LogError("decode failed."); return false; } tmp = _dictTrie.find(unico.begin(), unico.end()); res.push_back(make_pair(*itr, tmp == NULL ? "x" : tmp->tag)); } tmp = NULL; return !res.empty(); }
bool init(const string& dictPath, const string& hmmFilePath, const string& charStatus, const string& startProb, const string& emitProb, const string& endProb, const string& transProb) { assert(!_getInitFlag()); _dictTrie.init(dictPath); assert(_dictTrie); return _setInitFlag(_segment.init(dictPath, hmmFilePath)); };
bool extract(const string& str, vector<pair<string, double> >& keywords, size_t topN) const { vector<string> words; if(!_segment.cut(str, words)) { LogError("segment cut(%s) failed.", str.c_str()); return false; } // filtering single word. for(vector<string>::iterator iter = words.begin(); iter != words.end(); ) { if(_isSingleWord(*iter)) { iter = words.erase(iter); } else { iter++; } } map<string, double> wordmap; for(size_t i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } for(map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end(); ) { if(_stopWords.end() != _stopWords.find(itr->first)) { wordmap.erase(itr++); continue; } unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; } else { itr->second *= _idfAverage; } itr ++; } keywords.clear(); std::copy(wordmap.begin(), wordmap.end(), std::inserter(keywords, keywords.begin())); topN = MIN(topN, keywords.size()); partial_sort(keywords.begin(), keywords.begin() + topN, keywords.end(), _cmp); keywords.resize(topN); return true; }
/**
 * C-style wrapper: segments `text` and returns the words as a heap array.
 *
 * On success `result` points at an array allocated with malloc holding one
 * strdup'ed string per word; the caller releases each string and then the
 * array with free().
 *
 * On any failure (NULL `text`, segmentation failure, no words, or an
 * allocation failure) nothing stays allocated, `result` is set to NULL and
 * 0 is returned.
 *
 * @param text    NUL-terminated input text.
 * @param result  receives the word array, or NULL.
 * @return number of words stored in `result`.
 */
int cut_c(const char* text, char **&result)
{
    result = NULL;

    // Building a std::string from a NULL pointer is undefined behaviour.
    if (text == NULL)
    {
        return 0;
    }

    vector<string> words;
    if (!segment.cut(text, words) || words.empty())
    {
        return 0;
    }

    const size_t count = words.size();
    char** out = static_cast<char**>(malloc(sizeof(char*) * count));
    if (out == NULL)
    {
        return 0;
    }

    for (size_t i = 0; i < count; ++i)
    {
        out[i] = strdup(words[i].c_str());
        if (out[i] == NULL)
        {
            // Roll back the copies made so far so nothing leaks.
            while (i > 0)
            {
                free(out[--i]);
            }
            free(out);
            return 0;
        }
    }

    result = out;
    return static_cast<int>(count);
}
int main(int argc, char ** argv) { //demo { HMMSegment seg; if(!seg.init("../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); seg.dispose(); } { MixSegment seg; if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); cout<<"Cut type"<<endl; cut_type(&seg, "testlines.utf8"); cout<<endl; seg.dispose(); } { MPSegment seg; if(!seg.init("../dicts/jieba.dict.utf8")) { cout<<"seg init failed."<<endl; return false; } cut(&seg, "testlines.utf8"); seg.dispose(); } return EXIT_SUCCESS; }
/**
 * Cuts [begin, end) with `_mixSeg`, then re-cuts every resulting word that
 * is longer than `_maxWordLen` with `_fullSeg`.
 *
 * Words within the length limit are appended to `res` as produced by
 * `_mixSeg`. For a longer word the pieces produced by `_fullSeg` are
 * appended instead; if `_fullSeg` fails on it, the failure is logged and
 * that word contributes nothing.
 *
 * Asserts that the object has been initialised.
 *
 * @param begin  start of the input range.
 * @param end    end of the input range; must be after `begin`.
 * @param res    output; results are appended, existing content is kept.
 * @return false when the range is empty/inverted or `_mixSeg` fails,
 *         true otherwise.
 */
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const
{
    assert(_getInitFlag());
    if (begin >= end)
    {
        LogError("begin >= end");
        return false;
    }

    //use mix cut first
    vector<Unicode> mixRes;
    if (!_mixSeg.cut(begin, end, mixRes))
    {
        LogError("_mixSeg cut failed.");
        return false;
    }

    vector<Unicode> fullRes;
    for (vector<Unicode>::const_iterator mixResItr = mixRes.begin(); mixResItr != mixRes.end(); ++mixResItr)
    {
        // short enough: just use the mix result
        if (mixResItr->size() <= _maxWordLen)
        {
            res.push_back(*mixResItr);
            continue;
        }

        // Too long: cut with _fullSeg. The scratch buffer is emptied before
        // every call so that anything a failed cut may have left behind
        // cannot be emitted together with a later word's pieces.
        fullRes.clear();
        if (_fullSeg.cut(mixResItr->begin(), mixResItr->end(), fullRes))
        {
            for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); ++fullResItr)
            {
                res.push_back(*fullResItr);
            }
        }
        else
        {
            LogError("_fullSeg cut failed.");
        }
    }

    return true;
}
/**
 * One-time initialisation of both inner segmenters.
 *
 * `_mixSeg` is initialised from the dictionary and the model, `_fullSeg`
 * from the dictionary only. `_maxWordLen` is stored and the init flag is
 * set only after both have succeeded.
 *
 * @param dict        dictionary path passed to both segmenters.
 * @param model       model path passed to `_mixSeg`.
 * @param maxWordLen  value stored in `_maxWordLen`.
 * @return false if already initialised or if either segmenter fails to
 *         initialise; otherwise the result of `_setInitFlag(true)`.
 */
bool init(const string& dict, const string& model, size_t maxWordLen)
{
    if (_getInitFlag())
    {
        LogError("inited already.");
        return false;
    }

    if (_mixSeg.init(dict, model))
    {
        if (_fullSeg.init(dict))
        {
            // Both segmenters are ready: commit the configuration.
            _maxWordLen = maxWordLen;
            return _setInitFlag(true);
        }
        LogError("_fullSeg init");
        return false;
    }

    LogError("_mixSeg init");
    return false;
}
/**
 * C-style wrapper that initialises the shared `segment` object.
 *
 * @param dict_path   NUL-terminated dictionary path.
 * @param model_path  NUL-terminated model path.
 * @return false if either path is NULL; otherwise the result of
 *         `segment.init(dict_path, model_path)`.
 */
bool init_c(const char * dict_path, const char * model_path)
{
    // A NULL path from a C caller must not be forwarded as a path string.
    if (dict_path == NULL || model_path == NULL)
    {
        return false;
    }
    return segment.init(dict_path, model_path);
}
/**
 * Loads the IDF and stop-word dictionaries, initialises the segmenter and
 * records the outcome in the init flag.
 *
 * @param dictPath      dictionary path forwarded to `_segment.init`.
 * @param hmmFilePath   HMM model path forwarded to `_segment.init`.
 * @param idfPath       path passed to `_loadIdfDict`.
 * @param stopWordPath  path passed to `_loadStopWordDict`.
 * @return the value returned by `_setInitFlag` for the segmenter's
 *         initialisation result.
 */
bool init(const string& dictPath, const string& hmmFilePath, const string& idfPath, const string& stopWordPath)
{
    // Auxiliary dictionaries are loaded before the segmenter itself.
    _loadIdfDict(idfPath);
    _loadStopWordDict(stopWordPath);

    const bool segmentReady = _segment.init(dictPath, hmmFilePath);
    return _setInitFlag(segmentReady);
}