bool Encoder::learn(const char *templfile, const char *trainfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 || algorithm == MMAP || (algorithm == MIRA && thread_num == 1)) << "MIRA doesn't support multi-thrading"; EncoderFeatureIndex feature_index(thread_num); std::vector<TaggerImpl* > x; std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); #undef WHAT_ERROR #define WHAT_ERROR(msg) do { \ for (std::vector<TaggerImpl *>::iterator it = x.begin(); \ it != x.end(); ++it) \ delete *it; \ std::cerr << msg << std::endl; \ return false; } while (0) CHECK_FALSE(feature_index.open(templfile, trainfile)) << feature_index.what(); { progress_timer pg; std::ifstream ifs(trainfile); CHECK_FALSE(ifs) << "cannot open: " << trainfile; std::cout << "reading training data: " << std::flush; size_t line = 0; while (ifs) { TaggerImpl *_x = new TaggerImpl(); _x->open(&feature_index); _x->set_thread_id(line % thread_num); if (!_x->read(&ifs) || !_x->shrink()) WHAT_ERROR(_x->what()); if (!_x->empty()) x.push_back(_x); else delete _x; if (++line % 100 == 0) std::cout << line << ".. " << std::flush; } ifs.close(); std::cout << "\nDone!"; } feature_index.shrink(freq); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); std::cout << "Number of sentences: " << x.size() << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size << std::endl; progress_timer pg; switch (algorithm) { case MIRA: if (!runMIRA(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num)) WHAT_ERROR("MIRA execute error"); break; case CRF_L2: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) WHAT_ERROR("CRF_L2 execute error"); break; case CRF_L1: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) WHAT_ERROR("CRF_L1 execute error"); break; case MMAP: //save feature-list & corpus file to mem-mapping file for later crf-usage ExportToMmap(x,feature_index,modelfile); break; } for (std::vector<TaggerImpl *>::iterator it = x.begin(); it != x.end(); ++it) delete *it; if(algorithm != MMAP) { if (!feature_index.save(modelfile, textmodelfile)) WHAT_ERROR(feature_index.what()); } std::cout << "\nDone!"; return true; }
bool Encoder::learn(const char *templfile, const char *trainfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif if (algorithm == MIRA && thread_num > 1) { std::cerr << "MIRA doesn't support multi-thrading. use thread_num=1" << std::endl; } // feature template (uni-gram, bi-gram, tag) EncoderFeatureIndex feature_index; // info of strings, paths and nodes Allocator allocator(thread_num); // tagger collection std::vector<TaggerImpl* > x; std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); #define WHAT_ERROR(msg) do { \ for (std::vector<TaggerImpl *>::iterator it = x.begin(); \ it != x.end(); ++it) \ delete *it; \ std::cerr << msg << std::endl; \ return false; } while (0) // get template format and feature state list CHECK_FALSE(feature_index.open(templfile, trainfile)) << feature_index.what(); { progress_timer pg; std::ifstream ifs(WPATH(trainfile)); CHECK_FALSE(ifs) << "cannot open: " << trainfile; std::cout << "reading training data: " << std::flush; size_t line = 0; while (ifs) { // reading training file. TaggerImpl *_x = new TaggerImpl(); // read a complete sentence _x->open(&feature_index, &allocator); if (!_x->read(&ifs) || !_x->shrink()) { WHAT_ERROR(_x->what()); } if (!_x->empty()) { x.push_back(_x); } else { delete _x; continue; } _x->set_thread_id(line % thread_num); if (++line % 100 == 0) { std::cout << line << ".. " << std::flush; } } ifs.close(); std::cout << "\nDone!"; } feature_index.shrink(freq, &allocator); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); std::cout << "Number of sentences: " << x.size() << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size << std::endl; progress_timer pg; switch (algorithm) { case MIRA: if (!runMIRA(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num)) { WHAT_ERROR("MIRA execute error"); } break; case CRF_L2: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) { WHAT_ERROR("CRF_L2 execute error"); } break; case CRF_L1: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) { WHAT_ERROR("CRF_L1 execute error"); } break; } for (std::vector<TaggerImpl *>::iterator it = x.begin(); it != x.end(); ++it) { delete *it; } if (!feature_index.save(modelfile, textmodelfile)) { WHAT_ERROR(feature_index.what()); } std::cout << "\nDone!"; return true; }