示例#1
0
int crfpp_test(int argc, char **argv) {
  CRFPP::Param param;

  param.open(argc, argv, long_options);

  if (param.get<bool>("version")) {
    std::cout <<  param.version();
    return -1;
  }

  if (param.get<bool>("help")) {
    std::cout <<  param.help();
    return -1;
  }

  CRFPP::TaggerImpl tagger;
  if (!tagger.open(&param)) {
    std::cerr << tagger.what() << std::endl;
    return -1;
  }

  std::string output = param.get<std::string>("output");
  if (output.empty()) output = "-";
  CRFPP::ostream_wrapper os(output.c_str());
  if (!*os) {
    std::cerr << "no such file or directory: " << output << std::endl;
    return -1;
  }

  const std::vector<std::string>& rest_ = param.rest_args();
  std::vector<std::string> rest = rest_;  // trivial copy
  if (rest.empty()) rest.push_back("-");

  for (size_t i = 0; i < rest.size(); ++i) {
    CRFPP::istream_wrapper is(rest[i].c_str());
    if (!*is) {
      std::cerr << "no such file or directory: " << rest[i] << std::endl;
      return -1;
    }
    while (*is) tagger.parse_stream(is.get(), os.get());
  }

  return 0;
}
示例#2
0
// Command-line driver for training / model conversion.
//
// Parses argv against the option table below and then dispatches to one
// of three CRFPP::Encoder operations, using the remaining positional
// arguments (`rest`):
//   * --convert : encoder.convert(rest[0], rest[1])        (exactly 2 args)
//   * --mmap    : encoder.train(rest[0], rest[2], ...)     (exactly 3 args;
//                 rest[1] is required by the count check but not passed on)
//   * otherwise : encoder.learn(rest[0], rest[1], rest[2], ...)
//                                                          (exactly 3 args)
//
// The --algorithm value is lower-cased and must be one of
// "crf", "crf-l2", "crf-l1", "mira" or "mmap".
//
// Returns 0 on success, when param.help_version() yields false, or after
// printing the help text because --help was given or the positional
// argument count is wrong. Returns -1 for an unknown algorithm name or
// when the selected Encoder operation fails (its what() message is
// written to std::cerr).
int crfpp_learn(int argc, char **argv) {
  static const CRFPP::Option long_options[] = {
    {"freq",     'f', "1",      "INT",
     "use features that occur no less than INT(default 1)" },
    {"maxiter" , 'm', "100000", "INT",
     "set INT for max iterations in LBFGS routine(default 100k)" },
    {"cost",     'c', "1.0",    "FLOAT",
     "set FLOAT for cost parameter(default 1.0)" },
    {"eta",      'e', "0.0001", "FLOAT",
     "set FLOAT for termination criterion(default 0.0001)" },
    {"convert",  'C',  0,       0,
     "convert text model to binary model" },
    {"mmap",     'M',  0,       0,
     "Use precompiled binary train file" },
    {"textmodel", 't', 0,       0,
     "build also text model file for debugging" },
    {"algorithm",  'a', "CRF",   "(CRF|MIRA)", "select training algorithm" },
    {"thread", 'p',   "1",       "INT",   "number of threads(default 1)" },
    {"shrinking-size", 'H', "20", "INT",
     "set INT for number of iterations variable needs to "
     "be optimal before considered for shrinking. (default 20)" },
    {"version",  'v', 0,        0,       "show the version and exit" },
    {"help",     'h', 0,        0,       "show this help and exit" },
    {0, 0, 0, 0, 0}
  };

  CRFPP::Param param;

  // NOTE(review): the result of param.open() is not inspected here.
  param.open(argc, argv, long_options);

  // Presumably prints help/version itself and returns false when one of
  // them was requested -- confirm against Param::help_version().
  if (!param.help_version()) return 0;

  const bool convert = param.get<bool>("convert");
  const bool bmmap = param.get<bool>("mmap");

  // Validate the positional argument count for the selected mode before
  // any rest[i] is dereferenced further down.
  const std::vector<std::string> &rest = param.rest_args();
  if (param.get<bool>("help") ||
      (convert && rest.size() != 2) || (!convert && rest.size() != 3)) {
    std::cout << param.help();
    return 0;
  }

  // freq/maxiter are read as int and stored in size_t; cost/eta are read
  // as float and widened to double (kept as in the original code).
  size_t         freq           = param.get<int>("freq");
  size_t         maxiter        = param.get<int>("maxiter");
  double         C              = param.get<float>("cost");
  double         eta            = param.get<float>("eta");
  bool           textmodel      = param.get<bool>("textmodel");
  unsigned short thread         = param.get<unsigned short>("thread");
  unsigned short shrinking_size = param.get<unsigned short>("shrinking-size");
  std::string salgo = param.get<std::string>("algorithm");

  // Algorithm names are matched case-insensitively.
  toLower(&salgo);

  int algorithm = CRFPP::Encoder::MIRA;
  if (salgo == "crf" || salgo == "crf-l2") {
    algorithm = CRFPP::Encoder::CRF_L2;
  } else if (salgo == "crf-l1") {
    algorithm = CRFPP::Encoder::CRF_L1;
  } else if (salgo == "mira") {
    algorithm = CRFPP::Encoder::MIRA;
  } else if (salgo == "mmap") {
    algorithm = CRFPP::Encoder::MMAP;
  } else {
    std::cerr << "unknown algorithm: " << salgo << std::endl;
    return -1;
  }

  CRFPP::Encoder encoder;
  if (convert) {
    if (!encoder.convert(rest[0].c_str(), rest[1].c_str())) {
      std::cerr << encoder.what() << std::endl;
      return -1;
    }
  } else if (bmmap) {
    // rest[1] is intentionally not forwarded in this mode.
    if (!encoder.train(rest[0].c_str(),
                       rest[2].c_str(),
                       textmodel,
                       maxiter, freq, eta, C, thread, shrinking_size,
                       algorithm)) {
      std::cerr << encoder.what() << std::endl;
      return -1;
    }
  } else {
    if (!encoder.learn(rest[0].c_str(),
                       rest[1].c_str(),
                       rest[2].c_str(),
                       textmodel,
                       maxiter, freq, eta, C, thread, shrinking_size,
                       algorithm)) {
      std::cerr << encoder.what() << std::endl;
      return -1;
    }
  }

  return 0;
}