int crfpp_test(int argc, char **argv) { CRFPP::Param param; param.open(argc, argv, long_options); if (param.get<bool>("version")) { std::cout << param.version(); return -1; } if (param.get<bool>("help")) { std::cout << param.help(); return -1; } CRFPP::TaggerImpl tagger; if (!tagger.open(¶m)) { std::cerr << tagger.what() << std::endl; return -1; } std::string output = param.get<std::string>("output"); if (output.empty()) output = "-"; CRFPP::ostream_wrapper os(output.c_str()); if (!*os) { std::cerr << "no such file or directory: " << output << std::endl; return -1; } const std::vector<std::string>& rest_ = param.rest_args(); std::vector<std::string> rest = rest_; // trivial copy if (rest.empty()) rest.push_back("-"); for (size_t i = 0; i < rest.size(); ++i) { CRFPP::istream_wrapper is(rest[i].c_str()); if (!*is) { std::cerr << "no such file or directory: " << rest[i] << std::endl; return -1; } while (*is) tagger.parse_stream(is.get(), os.get()); } return 0; }
int crfpp_learn(int argc, char **argv) { static const CRFPP::Option long_options[] = { {"freq", 'f', "1", "INT", "use features that occuer no less than INT(default 1)" }, {"maxiter" , 'm', "100000", "INT", "set INT for max iterations in LBFGS routine(default 10k)" }, {"cost", 'c', "1.0", "FLOAT", "set FLOAT for cost parameter(default 1.0)" }, {"eta", 'e', "0.0001", "FLOAT", "set FLOAT for termination criterion(default 0.0001)" }, {"convert", 'C', 0, 0, "convert text model to binary model" }, {"mmap", 'M', 0, 0, "Use precompiled binary train file" }, {"textmodel", 't', 0, 0, "build also text model file for debugging" }, {"algorithm", 'a', "CRF", "(CRF|MIRA)", "select training algorithm" }, {"thread", 'p', "1", "INT", "number of threads(default 1)" }, {"shrinking-size", 'H', "20", "INT", "set INT for number of iterations variable needs to " " be optimal before considered for shrinking. (default 20)" }, {"version", 'v', 0, 0, "show the version and exit" }, {"help", 'h', 0, 0, "show this help and exit" }, {0, 0, 0, 0, 0} }; CRFPP::Param param; param.open(argc, argv, long_options); if (!param.help_version()) return 0; bool convert = param.get<bool>("convert"); bool bmmap = param.get<bool>("mmap"); const std::vector<std::string> &rest = param.rest_args(); if (param.get<bool>("help") || (convert && rest.size() != 2) || (!convert && rest.size() != 3)) { std::cout << param.help(); return 0; } size_t freq = param.get<int>("freq"); size_t maxiter = param.get<int>("maxiter"); double C = param.get<float>("cost"); double eta = param.get<float>("eta"); bool textmodel = param.get<bool>("textmodel"); unsigned short thread = param.get<unsigned short>("thread"); unsigned short shrinking_size = param.get<unsigned short>("shrinking-size"); std::string salgo = param.get<std::string>("algorithm"); toLower(&salgo); int algorithm = CRFPP::Encoder::MIRA; if (salgo == "crf" || salgo == "crf-l2") { algorithm = CRFPP::Encoder::CRF_L2; } else if (salgo == "crf-l1") { algorithm = CRFPP::Encoder::CRF_L1; } else if (salgo == "mira") { algorithm = CRFPP::Encoder::MIRA; } else if (salgo == "mmap") { algorithm = CRFPP::Encoder::MMAP; }else{ std::cerr << "unknown alogrithm: " << salgo << std::endl; return -1; } CRFPP::Encoder encoder; if (convert) { if (!encoder.convert(rest[0].c_str(), rest[1].c_str())) { std::cerr << encoder.what() << std::endl; return -1; } } else if(bmmap){ if (!encoder.train(rest[0].c_str(), rest[2].c_str(), textmodel, maxiter, freq, eta, C, thread, shrinking_size, algorithm)) { std::cerr << encoder.what() << std::endl; return -1; } }else{ if (!encoder.learn(rest[0].c_str(), rest[1].c_str(), rest[2].c_str(), textmodel, maxiter, freq, eta, C, thread, shrinking_size, algorithm)) { std::cerr << encoder.what() << std::endl; return -1; } } return 0; }