int main(int argc, char ** argv) { if(argc < 2) { cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" <<"options:\n" <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" <<"example:\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n" <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n" <<endl; return EXIT_FAILURE; } ArgvContext arg(argc, argv); string dictPath = arg["--dictpath"]; string modelPath = arg["--modelpath"]; string algorithm = arg["--algorithm"]; if("cutHMM" == algorithm) { HMMSegment seg; if(!seg.init(modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else if("cutMix" == algorithm) { MixSegment seg; if(!seg.init(dictPath.c_str(), modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else { MPSegment seg; if(!seg.init(dictPath.c_str())) { cout<<"seg init failed."<<endl; return false; } cut(&seg, arg[1].c_str()); seg.dispose(); } return EXIT_SUCCESS; }
/**
 * Initialises both underlying segmenters.
 *
 * Each step is wrapped in LIMONP_CHECK, so what happens on a failed init is
 * decided by that macro (NOTE(review): presumably it aborts -- confirm against
 * the limonp headers); when control reaches the end the function returns true.
 *
 * @param mpSegDict  dictionary path forwarded to _mpSeg.init.
 * @param hmmSegDict model path forwarded to _hmmSeg.init.
 * @param userDict   optional user dictionary forwarded to _mpSeg.init
 *                   (empty string by default).
 * @return true.
 */
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "")
{
    // The dictionary-based segmenter is set up first, then the HMM one.
    LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
    LIMONP_CHECK(_hmmSeg.init(hmmSegDict));

    // Informational logging is intentionally disabled:
    // LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());

    return true;
}
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { vector<Unicode> words; words.reserve(end - begin); if(!_mpSeg.cut(begin, end, words)) { Rcout<<"mpSeg cutDAG failed."<<std::endl; return false; } vector<Unicode> hmmRes; hmmRes.reserve(end - begin); Unicode piece; piece.reserve(end - begin); for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } // if mp get a single one and it is not in userdict, collect it in sequence j = i; while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; } // cut the sequence with hmm if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { Rcout<<"_hmmSeg cut failed."<<std::endl; return false; } //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } //clear tmp vars piece.clear(); hmmRes.clear(); //let i jump over this piece i = j - 1; } return true; }
bool dispose() { #ifndef NO_CODING_LOG if(!_getInitFlag()) { return true; } #endif _fullSeg.dispose(); _hmmSeg.dispose(); _setInitFlag(false); return true; }
int main(int argc, char ** argv) { //demo { HMMSegment seg; if(!seg.init("../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); seg.dispose(); } { MixSegment seg; if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); cout<<"Cut type"<<endl; cut_type(&seg, "testlines.utf8"); cout<<endl; seg.dispose(); } { MPSegment seg; if(!seg.init("../dicts/jieba.dict.utf8")) { cout<<"seg init failed."<<endl; return false; } cut(&seg, "testlines.utf8"); seg.dispose(); } return EXIT_SUCCESS; }
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") { assert(!_getInitFlag()); if(!_mpSeg.init(mpSegDict, userDict)) { LogError("_mpSeg init"); return false; } if(!_hmmSeg.init(hmmSegDict)) { LogError("_hmmSeg init"); return false; } LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); return _setInitFlag(true); }
bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { #ifndef NO_CODING_LOG if (!_getInitFlag()) { LogError("not inited."); return false; } if (begin > end) { LogError("begin > end"); return false; } #endif //use hmm cut first vector<Unicode> hmmRes; if (!_hmmSeg.cut(begin, end, hmmRes)) { LogError("_hmmSeg cut failed."); return false; } vector<Unicode> fullRes; for (vector<Unicode>::const_iterator hmmResItr = hmmRes.begin(); hmmResItr != hmmRes.end(); hmmResItr++) { // if it's too long, cut with _fullSeg, put fullRes in res if (hmmResItr->size() > _maxWordLen) { if (_fullSeg.cut(hmmResItr->begin(), hmmResItr->end(), fullRes)) { for (vector<Unicode>::const_iterator fullResItr = fullRes.begin(); fullResItr != fullRes.end(); fullResItr++) { res.push_back(*fullResItr); } } } else // just use the hmm result { res.push_back(*hmmResItr); } } return true; }
bool init() { #ifndef NO_CODING_LOG if (_getInitFlag()) { LogError("inited."); } #endif if (!_hmmSeg.init()) { LogError("_hmmSeg init"); return false; } if (!_fullSeg.init()) { LogError("_fullSeg init"); return false; } return _setInitFlag(true); }