示例#1
0
  // Trains a model from `trainfile` with the feature templates in `templfile`
  // and writes the result to `modelfile`.
  //
  // Parameters:
  //   templfile      - template file, handed to feature_index.open().
  //   trainfile      - training corpus; handed to feature_index.open() and
  //                    then read record by record into TaggerImpl objects.
  //   modelfile      - output path for feature_index.save() / ExportToMmap().
  //   textmodelfile  - forwarded to feature_index.save().
  //   maxitr, eta, C, shrinking_size, thread_num
  //                  - forwarded unchanged to runMIRA() / runCRF().
  //   freq           - forwarded to feature_index.shrink().
  //   algorithm      - one of MIRA, CRF_L2, CRF_L1 or MMAP.
  //
  // Returns true on success. Failures after the argument checks print a
  // message to std::cerr, free every tagger read so far and return false.
  // NOTE(review): CHECK_FALSE is defined outside this block; it is assumed to
  // record the streamed message and return false when its condition is false.
  bool Encoder::learn(const char *templfile,
                      const char *trainfile,
                      const char *modelfile,
                      bool textmodelfile,
                      size_t maxitr,
                      size_t freq,
                      double eta,
                      double C,
                      unsigned short thread_num,
                      unsigned short shrinking_size,
                      int algorithm) {
    std::cout << COPYRIGHT << std::endl;

    CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
    CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
    CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
    CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
    CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

    // Unknown algorithms get their own diagnostic; previously they were
    // reported with the unrelated MIRA/multi-thread message.
    CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 ||
                algorithm == MMAP || algorithm == MIRA)
      << "unknown algorithm: " << algorithm;
    CHECK_FALSE(algorithm != MIRA || thread_num == 1)
      << "MIRA doesn't support multi-thrading";

    EncoderFeatureIndex feature_index(thread_num);
    std::vector<TaggerImpl *> x;  // owns every tagger it holds

    std::cout.setf(std::ios::fixed, std::ios::floatfield);
    std::cout.precision(5);

    // WHAT_ERROR prints `msg`, frees every tagger owned by `x` and returns
    // false. The message is printed BEFORE the taggers are deleted because
    // `msg` may be read from one of them (see the read loop below).
#undef WHAT_ERROR
#define WHAT_ERROR(msg) do { \
    std::cerr << msg << std::endl; \
    for (std::vector<TaggerImpl *>::iterator it = x.begin(); \
         it != x.end(); ++it) \
      delete *it; \
    return false; } while (0)

    CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

    {
      progress_timer pg;

      std::ifstream ifs(trainfile);
      CHECK_FALSE(ifs) << "cannot open: " << trainfile;

      std::cout << "reading training data: " << std::flush;
      size_t line = 0;
      while (ifs) {
        TaggerImpl *_x = new TaggerImpl();
        _x->open(&feature_index);
        _x->set_thread_id(line % thread_num);
        if (!_x->read(&ifs) || !_x->shrink()) {
          // Hand _x over to x first so WHAT_ERROR releases it together with
          // the taggers already collected (it used to leak on this path).
          x.push_back(_x);
          WHAT_ERROR(_x->what());
        }

        if (!_x->empty())
          x.push_back(_x);
        else
          delete _x;

        if (++line % 100 == 0) std::cout << line << ".. " << std::flush;
      }

      ifs.close();
      std::cout << "\nDone!";
    }

    feature_index.shrink(freq);

    // Parameter vector, zero-initialised, one entry per feature.
    std::vector<double> alpha(feature_index.size(), 0.0);
    feature_index.set_alpha(&alpha[0]);

    std::cout << "Number of sentences: " << x.size() << std::endl;
    std::cout << "Number of features:  " << feature_index.size() << std::endl;
    std::cout << "Number of thread(s): " << thread_num << std::endl;
    std::cout << "Freq:                " << freq << std::endl;
    std::cout << "eta:                 " << eta << std::endl;
    std::cout << "C:                   " << C << std::endl;
    std::cout << "shrinking size:      " << shrinking_size
              << std::endl;

    progress_timer pg;

    switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num))
        WHAT_ERROR("MIRA execute error");
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false))
        WHAT_ERROR("CRF_L2 execute error");
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true))
        WHAT_ERROR("CRF_L1 execute error");
      break;
    case MMAP:
      // save feature-list & corpus file to mem-mapping file for later crf-usage
      // NOTE(review): any result of ExportToMmap is ignored here — confirm
      // whether it can report failure.
      ExportToMmap(x, feature_index, modelfile);
      break;
    }

    for (std::vector<TaggerImpl *>::iterator it = x.begin();
         it != x.end(); ++it)
      delete *it;
    // Forget the dangling pointers: WHAT_ERROR below walks x again and would
    // otherwise delete every tagger a second time when save() fails.
    x.clear();

    // The MMAP path has already written its output via ExportToMmap.
    if (algorithm != MMAP) {
      if (!feature_index.save(modelfile, textmodelfile))
        WHAT_ERROR(feature_index.what());
    }
    std::cout << "\nDone!";

    return true;
  }
示例#2
0
bool Encoder::learn(const char *templfile,
                    const char *trainfile,
                    const char *modelfile,
                    bool textmodelfile,
                    size_t maxitr,
                    size_t freq,
                    double eta,
                    double C,
                    unsigned short thread_num,
                    unsigned short shrinking_size,
                    int algorithm) {
  std::cout << COPYRIGHT << std::endl;

  CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
  CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
  CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
  CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
  CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

  if (algorithm == MIRA && thread_num > 1) {
    std::cerr <<  "MIRA doesn't support multi-thrading. use thread_num=1"
              << std::endl;
  }

  // feature template (uni-gram, bi-gram, tag)
  EncoderFeatureIndex feature_index;
  // info of strings, paths and nodes
  Allocator allocator(thread_num);
  // tagger collection
  std::vector<TaggerImpl* > x;

  std::cout.setf(std::ios::fixed, std::ios::floatfield);
  std::cout.precision(5);

#define WHAT_ERROR(msg) do {                                    \
    for (std::vector<TaggerImpl *>::iterator it = x.begin();    \
         it != x.end(); ++it)                                   \
      delete *it;                                               \
    std::cerr << msg << std::endl;                              \
    return false; } while (0)

  // get template format and feature state list
  CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

  {
    progress_timer pg;

    std::ifstream ifs(WPATH(trainfile));
    CHECK_FALSE(ifs) << "cannot open: " << trainfile;

    std::cout << "reading training data: " << std::flush;
    size_t line = 0;
    while (ifs) { // reading training file.
      TaggerImpl *_x = new TaggerImpl(); // read a complete sentence
      _x->open(&feature_index, &allocator);
      if (!_x->read(&ifs) || !_x->shrink()) {
        WHAT_ERROR(_x->what());
      }

      if (!_x->empty()) {
        x.push_back(_x);
      } else {
        delete _x;
        continue;
      }

      _x->set_thread_id(line % thread_num);

      if (++line % 100 == 0) {
        std::cout << line << ".. " << std::flush;
      }
    }

    ifs.close();
    std::cout << "\nDone!";
  }

  feature_index.shrink(freq, &allocator);

  std::vector <double> alpha(feature_index.size());           // parameter
  std::fill(alpha.begin(), alpha.end(), 0.0);
  feature_index.set_alpha(&alpha[0]);

  std::cout << "Number of sentences: " << x.size() << std::endl;
  std::cout << "Number of features:  " << feature_index.size() << std::endl;
  std::cout << "Number of thread(s): " << thread_num << std::endl;
  std::cout << "Freq:                " << freq << std::endl;
  std::cout << "eta:                 " << eta << std::endl;
  std::cout << "C:                   " << C << std::endl;
  std::cout << "shrinking size:      " << shrinking_size
            << std::endl;

  progress_timer pg;

  switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num)) {
        WHAT_ERROR("MIRA execute error");
      }
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false)) {
        WHAT_ERROR("CRF_L2 execute error");
      }
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true)) {
        WHAT_ERROR("CRF_L1 execute error");
      }
      break;
  }

  for (std::vector<TaggerImpl *>::iterator it = x.begin();
       it != x.end(); ++it) {
    delete *it;
  }

  if (!feature_index.save(modelfile, textmodelfile)) {
    WHAT_ERROR(feature_index.what());
  }

  std::cout << "\nDone!";

  return true;
}