/**
 * Interactive test driver: reads sentences from standard input, one per line,
 * segments each non-empty line with seg->cut2() and prints the resulting words.
 *
 * @param seg  segmenter used to cut every input line; must not be null
 *             (it is dereferenced without a check).
 * @return     always 0, once standard input is exhausted or fails.
 */
int MP_test( SegmentWrapper *seg )
{
    std::string s_sentence;
    MPSegment mpSeg;
    std::vector<STWordInfo> words2;

    // NOTE(review): mpSeg is never used for cutting below; the init call is
    // kept only because its side effects are not visible from here.
    mpSeg.init( pDict );

    // std::getline grows the string as needed. The previous fixed 1024-char
    // buffer made cin.getline() set failbit on longer lines, which silently
    // terminated the loop in the middle of the input.
    while (std::getline(std::cin, s_sentence))
    {
        if ( s_sentence.size() < 1 )
            continue;

        // cut the sentence into several words
        seg->cut2( s_sentence, words2 );

        std::cout << "\tword num:" << words2.size() << std::endl;
        for ( size_t i = 0; i < words2.size(); ++i )
            words2[i].print();
        std::cout << "-----------" << std::endl;

        // reuse the vector for the next line
        words2.clear();
    }
    return 0;
}
int main(int argc, char ** argv) { if(argc < 2) { cout<<"usage: \n\t"<<argv[0]<<" [options] <filename>\n" <<"options:\n" <<"\t--algorithm\tSupported methods are [cutDAG, cutHMM, cutMix] for now. \n\t\t\tIf not specified, the default is cutDAG\n" <<"\t--dictpath\tsee example\n" <<"\t--modelpath\tsee example\n" <<"example:\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8\n" <<"\t"<<argv[0]<<" testlines.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutHMM\n" <<"\t"<<argv[0]<<" testlines.utf8 --dictpath dicts/jieba.dict.utf8 --modelpath dicts/hmm_model.utf8 --algorithm cutMix\n" <<endl; return EXIT_FAILURE; } ArgvContext arg(argc, argv); string dictPath = arg["--dictpath"]; string modelPath = arg["--modelpath"]; string algorithm = arg["--algorithm"]; if("cutHMM" == algorithm) { HMMSegment seg; if(!seg.init(modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else if("cutMix" == algorithm) { MixSegment seg; if(!seg.init(dictPath.c_str(), modelPath.c_str())) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, arg[1].c_str()); seg.dispose(); } else { MPSegment seg; if(!seg.init(dictPath.c_str())) { cout<<"seg init failed."<<endl; return false; } cut(&seg, arg[1].c_str()); seg.dispose(); } return EXIT_SUCCESS; }
virtual bool cut(Unicode::const_iterator begin, Unicode::const_iterator end, vector<Unicode>& res) const { vector<Unicode> words; words.reserve(end - begin); if(!_mpSeg.cut(begin, end, words)) { Rcout<<"mpSeg cutDAG failed."<<std::endl; return false; } vector<Unicode> hmmRes; hmmRes.reserve(end - begin); Unicode piece; piece.reserve(end - begin); for (size_t i = 0, j = 0; i < words.size(); i++) { //if mp get a word, it's ok, put it into result if (1 != words[i].size() || (words[i].size() == 1 && _mpSeg.isUserDictSingleChineseWord(words[i][0]))) { res.push_back(words[i]); continue; } // if mp get a single one and it is not in userdict, collect it in sequence j = i; while (j < words.size() && 1 == words[j].size() && !_mpSeg.isUserDictSingleChineseWord(words[j][0])) { piece.push_back(words[j][0]); j++; } // cut the sequence with hmm if (!_hmmSeg.cut(piece.begin(), piece.end(), hmmRes)) { Rcout<<"_hmmSeg cut failed."<<std::endl; return false; } //put hmm result to result for (size_t k = 0; k < hmmRes.size(); k++) { res.push_back(hmmRes[k]); } //clear tmp vars piece.clear(); hmmRes.clear(); //let i jump over this piece i = j - 1; } return true; }
/**
 * Initialises both underlying segmenters.
 *
 * @param mpSegDict   forwarded to _mpSeg.init() as its first argument
 * @param hmmSegDict  forwarded to _hmmSeg.init()
 * @param userDict    forwarded to _mpSeg.init() as its second argument;
 *                    defaults to the empty string
 * @return            true when control reaches the end of the function;
 *                    what happens on a failed step is decided by
 *                    LIMONP_CHECK (not visible here).
 */
bool init(const string& mpSegDict,
          const string& hmmSegDict,
          const string& userDict = "")
{
    // The checked expressions are kept verbatim: the macro may embed their
    // text in its diagnostics.
    LIMONP_CHECK(_mpSeg.init(mpSegDict, userDict));
    LIMONP_CHECK(_hmmSeg.init(hmmSegDict));

    // Informational logging is currently disabled:
    // LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str());

    return true;
}
bool extract(const string& str, vector<pair<string, double> >& keywords, uint topN) const { vector<string> words; if(!_segment.cut(str, words)) { LogError("segment cut(%s) failed.", str.c_str()); return false; } unordered_map<string, double> wordmap; for(uint i = 0; i < words.size(); i ++) { wordmap[ words[i] ] += 1.0; } for(unordered_map<string, double>::iterator itr = wordmap.begin(); itr != wordmap.end();) { unordered_map<string, double>::const_iterator cit = _idfMap.find(itr->first); if(cit != _idfMap.end()) { itr->second *= cit->second; itr ++; } else { itr = wordmap.erase(itr); } } keywords.resize(MIN(topN, wordmap.size())); partial_sort_copy(wordmap.begin(), wordmap.end(), keywords.begin(), keywords.end(), _cmp); return true; }
bool init(const string& dictPath, const string& idfPath) { ifstream ifs(idfPath.c_str()); if(!ifs) { LogError("open %s failed.", idfPath.c_str()); return false; } string line ; vector<string> buf; for(uint lineno = 0; getline(ifs, line); lineno++) { buf.clear(); if(line.empty()) { LogError("line[%d] empty. skipped.", lineno); continue; } if(!split(line, buf, " ") || buf.size() != 2) { LogError("line %d [%s] illegal. skipped.", lineno, line.c_str()); continue; } _idfMap[buf[0]] = atof(buf[1].c_str()); } return _setInitFlag(_segment.init(dictPath)); };
int main(int argc, char ** argv) { //demo { HMMSegment seg; if(!seg.init("../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); seg.dispose(); } { MixSegment seg; if(!seg.init("../dicts/jieba.dict.utf8", "../dicts/hmm_model.utf8")) { cout<<"seg init failed."<<endl; return EXIT_FAILURE; } cut(&seg, "testlines.utf8"); cout<<"Cut type"<<endl; cut_type(&seg, "testlines.utf8"); cout<<endl; seg.dispose(); } { MPSegment seg; if(!seg.init("../dicts/jieba.dict.utf8")) { cout<<"seg init failed."<<endl; return false; } cut(&seg, "testlines.utf8"); seg.dispose(); } return EXIT_SUCCESS; }
bool init(const string& mpSegDict, const string& hmmSegDict, const string& userDict = "") { assert(!_getInitFlag()); if(!_mpSeg.init(mpSegDict, userDict)) { LogError("_mpSeg init"); return false; } if(!_hmmSeg.init(hmmSegDict)) { LogError("_hmmSeg init"); return false; } LogInfo("MixSegment init(%s, %s)", mpSegDict.c_str(), hmmSegDict.c_str()); return _setInitFlag(true); }
/**
 * Returns the DictTrie pointer reported by _mpSeg.getDictTrie(), unchanged.
 */
const DictTrie* getDictTrie() const
{
    const DictTrie* trie = _mpSeg.getDictTrie();
    return trie;
}