Example #1
0
  // Trains a CRF model from two pre-built files derived from `modelfile`:
  //   "<modelfile>.features"  - opened through MEncoderFeatureIndex::open
  //   "<modelfile>.train"     - memory-mapped corpus of serialized taggers
  //
  // Parameters:
  //   templfile      - not used by this function.
  //   modelfile      - base path; the two suffixes above are appended to it.
  //                    Must be short enough to fit the 512-byte path buffer.
  //   textmodelfile  - not used by this function.
  //   maxitr, C, eta, shrinking_size, thread_num - forwarded to runCRF.
  //   freq           - only echoed to stdout here.
  //   algorithm      - CRF_L2 or CRF_L1 select the regularizer passed to
  //                    runCRF (last argument false / true respectively).
  //
  // Returns true when training ran to completion, false when runCRF fails.
  // Argument validation goes through CHECK_FALSE, which is presumed to report
  // the streamed message and return false when its condition does not hold.
  bool Encoder::train(const char *templfile,
                      const char *modelfile,
                      bool textmodelfile,
                      size_t maxitr,
                      size_t freq,
                      double eta,
                      double C,
                      unsigned short thread_num,
                      unsigned short shrinking_size,
                      int algorithm) {
    std::cout << COPYRIGHT << std::endl;

    CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
    CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
    CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
    CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
    CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

    CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1
                || algorithm == MMAP ||
                (algorithm == MIRA && thread_num == 1))
      <<  "MIRA doesn't support multi-thrading";

    // Error exit used once the tagger array exists: frees it, prints the
    // message and returns false. `delete[]` on a null `x` is a no-op.
#define WHAT_ERROR(msg) do { \
    delete[] x; \
    std::cerr << msg << std::endl; \
    return false; } while (0)

    MEncoderFeatureIndex feature_index(thread_num);
    MTaggerImpl* x = NULL;

    std::cout.setf(std::ios::fixed, std::ios::floatfield);
    std::cout.precision(5);

    // ---- load features -------------------------------------------------
    char sufix_feature[] = ".features";
    char sufix_train[] = ".train";
    char buf[512];

    // Reject paths that would not fit in `buf` together with the longest
    // suffix and its terminating NUL (sizeof of the arrays includes the NUL).
    const size_t model_filename_len = strlen(modelfile);
    const size_t longest_suffix =
        sizeof(sufix_feature) > sizeof(sufix_train) ? sizeof(sufix_feature)
                                                    : sizeof(sufix_train);
    CHECK_FALSE(model_filename_len <= sizeof(buf) - longest_suffix)
      << "model file name is too long: " << modelfile;

    memcpy(buf, modelfile, model_filename_len);
    // Copying sizeof(suffix) bytes also copies the terminating NUL.
    memcpy(&buf[model_filename_len], sufix_feature, sizeof(sufix_feature));
    // NOTE(review): the result of open() is not checked here; confirm whether
    // MEncoderFeatureIndex::open reports failure through its return value.
    feature_index.open(buf, NULL);

    std::vector <double> alpha(feature_index.size());           // parameter
    std::fill(alpha.begin(), alpha.end(), 0.0);
    feature_index.set_alpha(&alpha[0]);

    // ---- load the memory-mapped training corpus ------------------------
    Mmap <char> mmap_;
    memcpy(&buf[model_filename_len], sufix_train, sizeof(sufix_train));

    CHECK_FALSE(mmap_.open(buf)) << mmap_.what();
    unsigned int x_len = 0;    // number of serialized taggers
    unsigned int max_len = 0;  // value handed to set_max_tagger
    {
      // Header layout as consumed below:
      //   uint32 x_len, uint32 max_len, uint16 pos_len,
      //   pos_len records of { uint16 id, uint16 str_len, str_len bytes },
      //   x_len uint32 offsets, then the tagger payloads.
      unsigned short pos_len = 0;
      char *ptr = mmap_.begin();
      read_static<unsigned int>(&ptr, &x_len);
      read_static<unsigned int>(&ptr, &max_len);
      read_static<unsigned short>(&ptr, &pos_len);  // the y-size()
      feature_index.set_ysize(pos_len);
      feature_index.set_max_tagger(max_len);
      x = new MTaggerImpl[x_len];

      // Skip the tag list; only its size is needed here.
      for (size_t i = 0; i < pos_len; i++) {
        unsigned short pos_id = 0;
        unsigned short pos_str_len = 0;
        read_static<unsigned short>(&ptr, &pos_id);
        read_static<unsigned short>(&ptr, &pos_str_len);
        ptr += pos_str_len;  // skip the string data.
      }

      // The offset table follows variable-length strings, so it may not be
      // aligned for unsigned int; read each entry with memcpy instead of
      // dereferencing a casted pointer.
      const char *offset_table = ptr;
      ptr += x_len * sizeof(unsigned int);  // ptr now marks the payload base
      for (size_t i = 0; i < x_len; i++) {
        unsigned int offset = 0;
        memcpy(&offset, offset_table + i * sizeof(unsigned int),
               sizeof(offset));
        char *cur = ptr + offset;
        x[i].open(&feature_index);
        x[i].import(cur);
        x[i].set_thread_id(i % thread_num);
      }
    }

    std::cout << "Number of sentences: " << x_len << std::endl;
    std::cout << "Number of features:  " << feature_index.size() << std::endl;
    std::cout << "Number of thread(s): " << thread_num << std::endl;
    std::cout << "Freq:                " << freq << std::endl;
    std::cout << "eta:                 " << eta << std::endl;
    std::cout << "C:                   " << C << std::endl;
    std::cout << "shrinking size:      " << shrinking_size<< std::endl;

    // ---- run the estimator ---------------------------------------------
    progress_timer pg;

    // NOTE(review): MIRA and MMAP pass the validation above but have no case
    // here, so for them the function returns true without training.
    switch (algorithm) {
    case CRF_L2:
      if (!runCRF(x, x_len, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false))
        WHAT_ERROR("CRF_L2 execute error");
      break;
    case CRF_L1:
      if (!runCRF(x, x_len, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true))
        WHAT_ERROR("CRF_L1 execute error");
      break;
    }

    // Release the taggers before unmapping the data they were imported from.
    delete[] x;

    // NOTE(review): nothing in this function writes the trained weights to
    // `modelfile`; confirm whether saving is expected to happen elsewhere.
    mmap_.close();
    return true;
  }
Example #2
0
  bool Encoder::learn(const char *templfile,
                      const char *trainfile,
                      const char *modelfile,
                      bool textmodelfile,
                      size_t maxitr,
                      size_t freq,
                      double eta,
                      double C,
                      unsigned short thread_num,
                      unsigned short shrinking_size,
                      int algorithm) {
    std::cout << COPYRIGHT << std::endl;

    CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
    CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
    CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
    CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
    CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

    CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 
				|| algorithm == MMAP ||
                (algorithm == MIRA && thread_num == 1))
                  <<  "MIRA doesn't support multi-thrading";

    EncoderFeatureIndex feature_index(thread_num);
    std::vector<TaggerImpl* > x;

    std::cout.setf(std::ios::fixed, std::ios::floatfield);
    std::cout.precision(5);
#undef WHAT_ERROR
#define WHAT_ERROR(msg) do { \
    for (std::vector<TaggerImpl *>::iterator it = x.begin(); \
         it != x.end(); ++it) \
      delete *it; \
    std::cerr << msg << std::endl; \
    return false; } while (0)

    CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

    {
      progress_timer pg;

      std::ifstream ifs(trainfile);
      CHECK_FALSE(ifs) << "cannot open: " << trainfile;

      std::cout << "reading training data: " << std::flush;
      size_t line = 0;
      while (ifs) {
        TaggerImpl *_x = new TaggerImpl();
        _x->open(&feature_index);
        _x->set_thread_id(line % thread_num);
        if (!_x->read(&ifs) || !_x->shrink())
          WHAT_ERROR(_x->what());

        if (!_x->empty())
          x.push_back(_x);
        else
          delete _x;

        if (++line % 100 == 0) std::cout << line << ".. " << std::flush;
      }

      ifs.close();
      std::cout << "\nDone!";
    }

    feature_index.shrink(freq);

    std::vector <double> alpha(feature_index.size());           // parameter
    std::fill(alpha.begin(), alpha.end(), 0.0);
    feature_index.set_alpha(&alpha[0]);

    std::cout << "Number of sentences: " << x.size() << std::endl;
    std::cout << "Number of features:  " << feature_index.size() << std::endl;
    std::cout << "Number of thread(s): " << thread_num << std::endl;
    std::cout << "Freq:                " << freq << std::endl;
    std::cout << "eta:                 " << eta << std::endl;
    std::cout << "C:                   " << C << std::endl;
    std::cout << "shrinking size:      " << shrinking_size
              << std::endl;

    progress_timer pg;

    switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num))
        WHAT_ERROR("MIRA execute error");
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false))
        WHAT_ERROR("CRF_L2 execute error");
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true))
        WHAT_ERROR("CRF_L1 execute error");
      break;
	case MMAP:
	  //save feature-list & corpus file to mem-mapping file for later crf-usage
	  ExportToMmap(x,feature_index,modelfile);
	  break;
    }

    for (std::vector<TaggerImpl *>::iterator it = x.begin();
         it != x.end(); ++it)
      delete *it;
	
	if(algorithm != MMAP) {
		if (!feature_index.save(modelfile, textmodelfile))
			WHAT_ERROR(feature_index.what());
	}
    std::cout << "\nDone!";

    return true;
  }
Example #3
0
bool Encoder::learn(const char *templfile,
                    const char *trainfile,
                    const char *modelfile,
                    bool textmodelfile,
                    size_t maxitr,
                    size_t freq,
                    double eta,
                    double C,
                    unsigned short thread_num,
                    unsigned short shrinking_size,
                    int algorithm) {
  std::cout << COPYRIGHT << std::endl;

  CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
  CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
  CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
  CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
  CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

  if (algorithm == MIRA && thread_num > 1) {
    std::cerr <<  "MIRA doesn't support multi-thrading. use thread_num=1"
              << std::endl;
  }

  // feature template (uni-gram, bi-gram, tag)
  EncoderFeatureIndex feature_index;
  // info of strings, paths and nodes
  Allocator allocator(thread_num);
  // tagger collection
  std::vector<TaggerImpl* > x;

  std::cout.setf(std::ios::fixed, std::ios::floatfield);
  std::cout.precision(5);

#define WHAT_ERROR(msg) do {                                    \
    for (std::vector<TaggerImpl *>::iterator it = x.begin();    \
         it != x.end(); ++it)                                   \
      delete *it;                                               \
    std::cerr << msg << std::endl;                              \
    return false; } while (0)

  // get template format and feature state list
  CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

  {
    progress_timer pg;

    std::ifstream ifs(WPATH(trainfile));
    CHECK_FALSE(ifs) << "cannot open: " << trainfile;

    std::cout << "reading training data: " << std::flush;
    size_t line = 0;
    while (ifs) { // reading training file.
      TaggerImpl *_x = new TaggerImpl(); // read a complete sentence
      _x->open(&feature_index, &allocator);
      if (!_x->read(&ifs) || !_x->shrink()) {
        WHAT_ERROR(_x->what());
      }

      if (!_x->empty()) {
        x.push_back(_x);
      } else {
        delete _x;
        continue;
      }

      _x->set_thread_id(line % thread_num);

      if (++line % 100 == 0) {
        std::cout << line << ".. " << std::flush;
      }
    }

    ifs.close();
    std::cout << "\nDone!";
  }

  feature_index.shrink(freq, &allocator);

  std::vector <double> alpha(feature_index.size());           // parameter
  std::fill(alpha.begin(), alpha.end(), 0.0);
  feature_index.set_alpha(&alpha[0]);

  std::cout << "Number of sentences: " << x.size() << std::endl;
  std::cout << "Number of features:  " << feature_index.size() << std::endl;
  std::cout << "Number of thread(s): " << thread_num << std::endl;
  std::cout << "Freq:                " << freq << std::endl;
  std::cout << "eta:                 " << eta << std::endl;
  std::cout << "C:                   " << C << std::endl;
  std::cout << "shrinking size:      " << shrinking_size
            << std::endl;

  progress_timer pg;

  switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num)) {
        WHAT_ERROR("MIRA execute error");
      }
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false)) {
        WHAT_ERROR("CRF_L2 execute error");
      }
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true)) {
        WHAT_ERROR("CRF_L1 execute error");
      }
      break;
  }

  for (std::vector<TaggerImpl *>::iterator it = x.begin();
       it != x.end(); ++it) {
    delete *it;
  }

  if (!feature_index.save(modelfile, textmodelfile)) {
    WHAT_ERROR(feature_index.what());
  }

  std::cout << "\nDone!";

  return true;
}