bool Encoder::train(const char *templfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 || algorithm == MMAP || (algorithm == MIRA && thread_num == 1)) << "MIRA doesn't support multi-thrading"; #define WHAT_ERROR(msg) do { \ delete[] x; \ std::cerr << msg << std::endl; \ return false; } while (0) MEncoderFeatureIndex feature_index(thread_num); MTaggerImpl* x = NULL; size_t s = sizeof(MTaggerImpl); std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); //load features char sufix_feature[] = ".features"; char sufix_train[] = ".train"; char buf[512]; size_t model_filename_len = strlen(modelfile); memcpy(buf,modelfile, model_filename_len); memcpy(&buf[model_filename_len],sufix_feature,strlen(sufix_feature)); buf[model_filename_len + strlen(sufix_feature) ] = 0; feature_index.open(buf,NULL); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); //load pos-tagger Mmap <char> mmap_; memcpy(&buf[model_filename_len],sufix_train,strlen(sufix_train)); buf[model_filename_len + strlen(sufix_train) ] = 0; CHECK_FALSE(mmap_.open(buf)) << mmap_.what(); unsigned int x_len = 0; unsigned int max_len = 0; { unsigned short pos_len = 0; char *ptr = mmap_.begin(); read_static<unsigned int>(&ptr, &x_len); read_static<unsigned int>(&ptr, &max_len); read_static<unsigned short>(&ptr, &pos_len); //the y-size() feature_index.set_ysize(pos_len); feature_index.set_max_tagger(max_len); x = new MTaggerImpl[x_len]; //skip pos-list for(size_t i = 0; i<pos_len; i++) { unsigned short pos_id; unsigned short pos_str_len; read_static<unsigned short>(&ptr, &pos_id); read_static<unsigned short>(&ptr, &pos_str_len); ptr += pos_str_len; //skip the string data. } unsigned int* tagger_offset = (unsigned int*)(ptr); ptr += x_len * sizeof(unsigned int); //skip index for(size_t i = 0;i < x_len; i++) { char* cur = ptr+tagger_offset[i]; x[i].open(&feature_index); x[i].import(cur); x[i].set_thread_id(i % thread_num); } } std::cout << "Number of sentences: " << x_len << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size<< std::endl; //begin estimator progress_timer pg; switch (algorithm) { case CRF_L2: if (!runCRF(x, x_len, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) WHAT_ERROR("CRF_L2 execute error"); break; case CRF_L1: if (!runCRF(x, x_len, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) WHAT_ERROR("CRF_L1 execute error"); break; } //clear if(x) delete[] x; //save module mmap_.close(); return true; }
bool Encoder::learn(const char *templfile, const char *trainfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 || algorithm == MMAP || (algorithm == MIRA && thread_num == 1)) << "MIRA doesn't support multi-thrading"; EncoderFeatureIndex feature_index(thread_num); std::vector<TaggerImpl* > x; std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); #undef WHAT_ERROR #define WHAT_ERROR(msg) do { \ for (std::vector<TaggerImpl *>::iterator it = x.begin(); \ it != x.end(); ++it) \ delete *it; \ std::cerr << msg << std::endl; \ return false; } while (0) CHECK_FALSE(feature_index.open(templfile, trainfile)) << feature_index.what(); { progress_timer pg; std::ifstream ifs(trainfile); CHECK_FALSE(ifs) << "cannot open: " << trainfile; std::cout << "reading training data: " << std::flush; size_t line = 0; while (ifs) { TaggerImpl *_x = new TaggerImpl(); _x->open(&feature_index); _x->set_thread_id(line % thread_num); if (!_x->read(&ifs) || !_x->shrink()) WHAT_ERROR(_x->what()); if (!_x->empty()) x.push_back(_x); else delete _x; if (++line % 100 == 0) std::cout << line << ".. " << std::flush; } ifs.close(); std::cout << "\nDone!"; } feature_index.shrink(freq); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); std::cout << "Number of sentences: " << x.size() << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size << std::endl; progress_timer pg; switch (algorithm) { case MIRA: if (!runMIRA(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num)) WHAT_ERROR("MIRA execute error"); break; case CRF_L2: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) WHAT_ERROR("CRF_L2 execute error"); break; case CRF_L1: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) WHAT_ERROR("CRF_L1 execute error"); break; case MMAP: //save feature-list & corpus file to mem-mapping file for later crf-usage ExportToMmap(x,feature_index,modelfile); break; } for (std::vector<TaggerImpl *>::iterator it = x.begin(); it != x.end(); ++it) delete *it; if(algorithm != MMAP) { if (!feature_index.save(modelfile, textmodelfile)) WHAT_ERROR(feature_index.what()); } std::cout << "\nDone!"; return true; }
bool Encoder::learn(const char *templfile, const char *trainfile, const char *modelfile, bool textmodelfile, size_t maxitr, size_t freq, double eta, double C, unsigned short thread_num, unsigned short shrinking_size, int algorithm) { std::cout << COPYRIGHT << std::endl; CHECK_FALSE(eta > 0.0) << "eta must be > 0.0"; CHECK_FALSE(C >= 0.0) << "C must be >= 0.0"; CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1"; CHECK_FALSE(thread_num > 0) << "thread must be > 0"; #ifndef CRFPP_USE_THREAD CHECK_FALSE(thread_num == 1) << "This architecture doesn't support multi-thrading"; #endif if (algorithm == MIRA && thread_num > 1) { std::cerr << "MIRA doesn't support multi-thrading. use thread_num=1" << std::endl; } // feature template (uni-gram, bi-gram, tag) EncoderFeatureIndex feature_index; // info of strings, paths and nodes Allocator allocator(thread_num); // tagger collection std::vector<TaggerImpl* > x; std::cout.setf(std::ios::fixed, std::ios::floatfield); std::cout.precision(5); #define WHAT_ERROR(msg) do { \ for (std::vector<TaggerImpl *>::iterator it = x.begin(); \ it != x.end(); ++it) \ delete *it; \ std::cerr << msg << std::endl; \ return false; } while (0) // get template format and feature state list CHECK_FALSE(feature_index.open(templfile, trainfile)) << feature_index.what(); { progress_timer pg; std::ifstream ifs(WPATH(trainfile)); CHECK_FALSE(ifs) << "cannot open: " << trainfile; std::cout << "reading training data: " << std::flush; size_t line = 0; while (ifs) { // reading training file. TaggerImpl *_x = new TaggerImpl(); // read a complete sentence _x->open(&feature_index, &allocator); if (!_x->read(&ifs) || !_x->shrink()) { WHAT_ERROR(_x->what()); } if (!_x->empty()) { x.push_back(_x); } else { delete _x; continue; } _x->set_thread_id(line % thread_num); if (++line % 100 == 0) { std::cout << line << ".. " << std::flush; } } ifs.close(); std::cout << "\nDone!"; } feature_index.shrink(freq, &allocator); std::vector <double> alpha(feature_index.size()); // parameter std::fill(alpha.begin(), alpha.end(), 0.0); feature_index.set_alpha(&alpha[0]); std::cout << "Number of sentences: " << x.size() << std::endl; std::cout << "Number of features: " << feature_index.size() << std::endl; std::cout << "Number of thread(s): " << thread_num << std::endl; std::cout << "Freq: " << freq << std::endl; std::cout << "eta: " << eta << std::endl; std::cout << "C: " << C << std::endl; std::cout << "shrinking size: " << shrinking_size << std::endl; progress_timer pg; switch (algorithm) { case MIRA: if (!runMIRA(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num)) { WHAT_ERROR("MIRA execute error"); } break; case CRF_L2: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, false)) { WHAT_ERROR("CRF_L2 execute error"); } break; case CRF_L1: if (!runCRF(x, &feature_index, &alpha[0], maxitr, C, eta, shrinking_size, thread_num, true)) { WHAT_ERROR("CRF_L1 execute error"); } break; } for (std::vector<TaggerImpl *>::iterator it = x.begin(); it != x.end(); ++it) { delete *it; } if (!feature_index.save(modelfile, textmodelfile)) { WHAT_ERROR(feature_index.what()); } std::cout << "\nDone!"; return true; }