Ejemplo n.º 1
0
/**
 * Command-line entry point: parses every input file with a CaboCha parser
 * and streams the results to the configured output.
 *
 * The output target is the "output" option ("-" when it is empty) and the
 * inputs are the remaining command-line arguments ("-" when there are none;
 * presumably "-" means stdout/stdin -- confirm against the stream wrappers).
 *
 * NOTE(review): WHAT_ERROR is expected to be defined before this function
 * (it is #undef'd at the end); its definition is not visible here.
 *
 * @param argc argument count, forwarded to CaboCha::Param::open.
 * @param argv argument vector, forwarded to CaboCha::Param::open.
 * @return EXIT_SUCCESS once every input has been consumed (or when
 *         help_version() reports false), EXIT_FAILURE when a sentence
 *         cannot be read.
 */
int cabocha_do(int argc, char **argv) {
  CaboCha::ParserImpl parser;
  CaboCha::Param param;

  param.open(argc, argv, long_options);

  // A false result ends the run successfully (presumably help/version text
  // was already printed -- confirm against Param::help_version).
  if (!param.help_version()) {
    return EXIT_SUCCESS;
  }

  std::string ofilename = param.get<std::string>("output");
  if (ofilename.empty()) {
    ofilename = "-";
  }

  CaboCha::ostream_wrapper ofs(ofilename.c_str());
  if (!*ofs) {
    WHAT_ERROR("no such file or directory: " << ofilename);
  }

  if (!parser.open(&param)) {
    std::cout << parser.what() << std::endl;
    std::exit(EXIT_FAILURE);
  }

  // Work on a copy so a default input can be appended.
  std::vector<std::string> rest = param.rest_args();
  if (rest.empty()) {
    rest.push_back("-");
  }

  const int input_layer = param.get<int>("input-layer");
  std::string input;

  for (size_t i = 0; i < rest.size(); ++i) {
    CaboCha::istream_wrapper ifs(rest[i].c_str());

    if (!*ifs) {
      WHAT_ERROR("no such file or directory: " << rest[i]);
    }

    while (true) {
      if (!CaboCha::read_sentence(ifs.get(), &input, input_layer)) {
        std::cerr << "too long line #line must be <= "
                  << CABOCHA_MAX_LINE_SIZE << std::endl;
        // This is an error: report it through the exit status instead of
        // returning false (== 0 == success).
        return EXIT_FAILURE;
      }

      // End of this input: move on to the next file instead of leaving the
      // function, otherwise only the first file would ever be parsed.
      if (ifs->eof() && input.empty()) {
        break;
      }

      if (ifs->fail()) {
        std::cerr << "input-beffer overflow. "
                  << "The line is splitted. use -b #SIZE option."
                  << std::endl;
        ifs->clear();  // reset the error state so reading can continue
      }

      const char *r = parser.parseToString(input.c_str(), input.size());
      if (!r) {
        WHAT_ERROR(parser.what());
      }
      *ofs << r << std::flush;
    }
  }

  return EXIT_SUCCESS;

#undef WHAT_ERROR
}
Ejemplo n.º 2
0
  bool Encoder::learn(const char *templfile,
                      const char *trainfile,
                      const char *modelfile,
                      bool textmodelfile,
                      size_t maxitr,
                      size_t freq,
                      double eta,
                      double C,
                      unsigned short thread_num,
                      unsigned short shrinking_size,
                      int algorithm) {
    std::cout << COPYRIGHT << std::endl;

    CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
    CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
    CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
    CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
    CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

    CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1 
				|| algorithm == MMAP ||
                (algorithm == MIRA && thread_num == 1))
                  <<  "MIRA doesn't support multi-thrading";

    EncoderFeatureIndex feature_index(thread_num);
    std::vector<TaggerImpl* > x;

    std::cout.setf(std::ios::fixed, std::ios::floatfield);
    std::cout.precision(5);
#undef WHAT_ERROR
#define WHAT_ERROR(msg) do { \
    for (std::vector<TaggerImpl *>::iterator it = x.begin(); \
         it != x.end(); ++it) \
      delete *it; \
    std::cerr << msg << std::endl; \
    return false; } while (0)

    CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

    {
      progress_timer pg;

      std::ifstream ifs(trainfile);
      CHECK_FALSE(ifs) << "cannot open: " << trainfile;

      std::cout << "reading training data: " << std::flush;
      size_t line = 0;
      while (ifs) {
        TaggerImpl *_x = new TaggerImpl();
        _x->open(&feature_index);
        _x->set_thread_id(line % thread_num);
        if (!_x->read(&ifs) || !_x->shrink())
          WHAT_ERROR(_x->what());

        if (!_x->empty())
          x.push_back(_x);
        else
          delete _x;

        if (++line % 100 == 0) std::cout << line << ".. " << std::flush;
      }

      ifs.close();
      std::cout << "\nDone!";
    }

    feature_index.shrink(freq);

    std::vector <double> alpha(feature_index.size());           // parameter
    std::fill(alpha.begin(), alpha.end(), 0.0);
    feature_index.set_alpha(&alpha[0]);

    std::cout << "Number of sentences: " << x.size() << std::endl;
    std::cout << "Number of features:  " << feature_index.size() << std::endl;
    std::cout << "Number of thread(s): " << thread_num << std::endl;
    std::cout << "Freq:                " << freq << std::endl;
    std::cout << "eta:                 " << eta << std::endl;
    std::cout << "C:                   " << C << std::endl;
    std::cout << "shrinking size:      " << shrinking_size
              << std::endl;

    progress_timer pg;

    switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num))
        WHAT_ERROR("MIRA execute error");
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false))
        WHAT_ERROR("CRF_L2 execute error");
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true))
        WHAT_ERROR("CRF_L1 execute error");
      break;
	case MMAP:
	  //save feature-list & corpus file to mem-mapping file for later crf-usage
	  ExportToMmap(x,feature_index,modelfile);
	  break;
    }

    for (std::vector<TaggerImpl *>::iterator it = x.begin();
         it != x.end(); ++it)
      delete *it;
	
	if(algorithm != MMAP) {
		if (!feature_index.save(modelfile, textmodelfile))
			WHAT_ERROR(feature_index.what());
	}
    std::cout << "\nDone!";

    return true;
  }
Ejemplo n.º 3
0
bool Encoder::learn(const char *templfile,
                    const char *trainfile,
                    const char *modelfile,
                    bool textmodelfile,
                    size_t maxitr,
                    size_t freq,
                    double eta,
                    double C,
                    unsigned short thread_num,
                    unsigned short shrinking_size,
                    int algorithm) {
  std::cout << COPYRIGHT << std::endl;

  CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
  CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
  CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
  CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
  CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

  if (algorithm == MIRA && thread_num > 1) {
    std::cerr <<  "MIRA doesn't support multi-thrading. use thread_num=1"
              << std::endl;
  }

  // feature template (uni-gram, bi-gram, tag)
  EncoderFeatureIndex feature_index;
  // info of strings, paths and nodes
  Allocator allocator(thread_num);
  // tagger collection
  std::vector<TaggerImpl* > x;

  std::cout.setf(std::ios::fixed, std::ios::floatfield);
  std::cout.precision(5);

#define WHAT_ERROR(msg) do {                                    \
    for (std::vector<TaggerImpl *>::iterator it = x.begin();    \
         it != x.end(); ++it)                                   \
      delete *it;                                               \
    std::cerr << msg << std::endl;                              \
    return false; } while (0)

  // get template format and feature state list
  CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

  {
    progress_timer pg;

    std::ifstream ifs(WPATH(trainfile));
    CHECK_FALSE(ifs) << "cannot open: " << trainfile;

    std::cout << "reading training data: " << std::flush;
    size_t line = 0;
    while (ifs) { // reading training file.
      TaggerImpl *_x = new TaggerImpl(); // read a complete sentence
      _x->open(&feature_index, &allocator);
      if (!_x->read(&ifs) || !_x->shrink()) {
        WHAT_ERROR(_x->what());
      }

      if (!_x->empty()) {
        x.push_back(_x);
      } else {
        delete _x;
        continue;
      }

      _x->set_thread_id(line % thread_num);

      if (++line % 100 == 0) {
        std::cout << line << ".. " << std::flush;
      }
    }

    ifs.close();
    std::cout << "\nDone!";
  }

  feature_index.shrink(freq, &allocator);

  std::vector <double> alpha(feature_index.size());           // parameter
  std::fill(alpha.begin(), alpha.end(), 0.0);
  feature_index.set_alpha(&alpha[0]);

  std::cout << "Number of sentences: " << x.size() << std::endl;
  std::cout << "Number of features:  " << feature_index.size() << std::endl;
  std::cout << "Number of thread(s): " << thread_num << std::endl;
  std::cout << "Freq:                " << freq << std::endl;
  std::cout << "eta:                 " << eta << std::endl;
  std::cout << "C:                   " << C << std::endl;
  std::cout << "shrinking size:      " << shrinking_size
            << std::endl;

  progress_timer pg;

  switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num)) {
        WHAT_ERROR("MIRA execute error");
      }
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false)) {
        WHAT_ERROR("CRF_L2 execute error");
      }
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true)) {
        WHAT_ERROR("CRF_L1 execute error");
      }
      break;
  }

  for (std::vector<TaggerImpl *>::iterator it = x.begin();
       it != x.end(); ++it) {
    delete *it;
  }

  if (!feature_index.save(modelfile, textmodelfile)) {
    WHAT_ERROR(feature_index.what());
  }

  std::cout << "\nDone!";

  return true;
}
Ejemplo n.º 4
0
  /**
   * Trains a CRF from data previously exported next to the model file:
   * features are opened from "<modelfile>.features" and the taggers are
   * imported from the memory-mapped "<modelfile>.train".
   *
   * NOTE(review): templfile and textmodelfile are never used, freq is only
   * printed, and nothing in this function writes the trained parameters
   * out -- confirm whether a save step is missing.
   *
   * @param templfile      unused.
   * @param modelfile      base path; ".features" / ".train" are appended.
   * @param textmodelfile  unused.
   * @param maxitr         forwarded to runCRF().
   * @param freq           only echoed to stdout.
   * @param eta            must be > 0.0; forwarded to runCRF().
   * @param C              must be >= 0.0; forwarded to runCRF().
   * @param thread_num     must be > 0 (and 1 when CRFPP_USE_THREAD is not
   *                       defined, or for MIRA).
   * @param shrinking_size must be >= 1; forwarded to runCRF().
   * @param algorithm      CRF_L2 or CRF_L1 run training; see the note at
   *                       the switch for other accepted values.
   * @return true when the function runs to the end, false when runCRF()
   *         fails (CHECK_FALSE presumably also returns false -- its
   *         definition is not visible here).
   */
  bool Encoder::train(const char *templfile,
			const char *modelfile,
			bool textmodelfile,
			size_t maxitr,
			size_t freq,
			double eta,
			double C,
			unsigned short thread_num,
			unsigned short shrinking_size,
			int algorithm) {
		std::cout << COPYRIGHT << std::endl;

		CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
		CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
		CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
		CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
		CHECK_FALSE(thread_num == 1)
			<< "This architecture doesn't support multi-thrading";
#endif

		CHECK_FALSE(algorithm == CRF_L2 || algorithm == CRF_L1
			|| algorithm == MMAP ||
			(algorithm == MIRA && thread_num == 1))
			<<  "MIRA doesn't support multi-thrading";

		// Error exit: release the tagger array, report, and fail.
#define WHAT_ERROR(msg) do { \
	delete[] x; \
	std::cerr << msg << std::endl; \
	return false; } while (0)

		MEncoderFeatureIndex feature_index(thread_num);
		MTaggerImpl* x = NULL;

		std::cout.setf(std::ios::fixed, std::ios::floatfield);
		std::cout.precision(5);
		//load features

		char sufix_feature[] = ".features";
		char sufix_train[] = ".train";
		char buf[512];

		size_t model_filename_len = strlen(modelfile);
		// Refuse names that do not fit into buf together with the longest
		// suffix; sizeof() already counts the terminating NUL. x is still
		// NULL here, so returning through CHECK_FALSE leaks nothing.
		CHECK_FALSE(model_filename_len + sizeof(sufix_feature) <= sizeof(buf))
			<< "model file name is too long: " << modelfile;
		memcpy(buf, modelfile, model_filename_len);
		// Copy the suffix including its terminating NUL.
		memcpy(&buf[model_filename_len], sufix_feature, sizeof(sufix_feature));
		// NOTE(review): the result of open() (if any) is not checked.
		feature_index.open(buf,NULL);

		std::vector <double> alpha(feature_index.size());           // parameter
		std::fill(alpha.begin(), alpha.end(), 0.0);
		feature_index.set_alpha(&alpha[0]);

		//load pos-tagger
		Mmap <char> mmap_;
		// ".train" is shorter than ".features", so the guard above covers it.
		memcpy(&buf[model_filename_len], sufix_train, sizeof(sufix_train));

		CHECK_FALSE(mmap_.open(buf)) << mmap_.what();
		unsigned int x_len = 0;
		unsigned int max_len = 0;
		{
			// Layout as read below: x_len, max_len, pos_len, then pos_len
			// entries of (id, string length, string bytes), then x_len
			// offsets, then the tagger data those offsets point into.
			unsigned short pos_len = 0;
			char *ptr = mmap_.begin();
			read_static<unsigned int>(&ptr, &x_len);
			read_static<unsigned int>(&ptr, &max_len);
			read_static<unsigned short>(&ptr, &pos_len); //the y-size()
			feature_index.set_ysize(pos_len);
			feature_index.set_max_tagger(max_len);
			x = new MTaggerImpl[x_len];
			//skip pos-list
			for(size_t i = 0; i<pos_len; i++) {
				unsigned short pos_id;
				unsigned short pos_str_len;
				read_static<unsigned short>(&ptr, &pos_id);
				read_static<unsigned short>(&ptr, &pos_str_len);
				ptr += pos_str_len; //skip the string data.
			}
			unsigned int* tagger_offset = (unsigned int*)(ptr);
			ptr += x_len * sizeof(unsigned int);
			//skip index
			for(size_t i = 0;i < x_len; i++) {
				// Offsets are relative to the first byte after the table.
				char* cur = ptr+tagger_offset[i];
				x[i].open(&feature_index);
				x[i].import(cur);
				x[i].set_thread_id(i % thread_num);
			}
		}

		std::cout << "Number of sentences: " << x_len << std::endl;
		std::cout << "Number of features:  " << feature_index.size() << std::endl;
		std::cout << "Number of thread(s): " << thread_num << std::endl;
		std::cout << "Freq:                " << freq << std::endl;
		std::cout << "eta:                 " << eta << std::endl;
		std::cout << "C:                   " << C << std::endl;
		std::cout << "shrinking size:      " << shrinking_size<< std::endl;
		//begin estimator
		progress_timer pg;

		// NOTE(review): MIRA and MMAP pass the validation above but match
		// no case here, so they fall through and the function returns true
		// without training -- confirm this is intended.
		switch (algorithm) {
		case CRF_L2:
			if (!runCRF(x, x_len, &feature_index, &alpha[0],
				maxitr, C, eta, shrinking_size, thread_num, false))
				WHAT_ERROR("CRF_L2 execute error");
			break;
		case CRF_L1:
			if (!runCRF(x, x_len, &feature_index, &alpha[0],
				maxitr, C, eta, shrinking_size, thread_num, true))
				WHAT_ERROR("CRF_L1 execute error");
			break;
		}
		//clear
		delete[] x;
		mmap_.close();
		return true;
  }
Ejemplo n.º 5
0
/**
 * Command-line entry point: tags every input file with MeCab and streams
 * the results to the configured output.
 *
 * The output target is the "output" option ("-" when it is empty) and the
 * inputs are the remaining command-line arguments ("-" when there are none;
 * presumably "-" means stdout/stdin -- confirm against the stream wrappers).
 *
 * NOTE(review): the blocks marked "debug" print to std::cout and therefore
 * mix with regular output when it goes to the same stream.
 *
 * @param argc argument count, forwarded to MeCab::Param::open.
 * @param argv argument vector, forwarded to MeCab::Param::open.
 * @return EXIT_SUCCESS after help/version/dump-config/dictionary-info
 *         output or once every input has been consumed; EXIT_FAILURE on any
 *         error.
 */
int mecab_do(int argc, char **argv) {

    //debug
    std::cout << "[" << __LINE__ << "]: "
              << "mecab_do(int argc, char **argv)" << __FILE__ << std::endl;
    ///

    // Error exit: print the message and fail. No trailing ';' after
    // while (0), so the macro behaves like a single statement.
#define WHAT_ERROR(msg) do {                    \
    std::cout << msg << std::endl;              \
    return EXIT_FAILURE; }                      \
  while (0)

//	//debug
//	std::cout << "[" << __LINE__ << "]: " << __FILE__ << std::endl;
//	WHAT_ERROR("ABCDE");
//	///

    MeCab::Param param;
    if (!param.open(argc, argv, MeCab::long_options)) {
        std::cout << param.what() << std::endl;
        return EXIT_FAILURE;
    }

    if (param.get<bool>("help")) {
        std::cout << param.help() << std::endl;
        return EXIT_SUCCESS;
    }

    if (param.get<bool>("version")) {
        std::cout << param.version() << std::endl;
        return EXIT_SUCCESS;
    }

    if (!load_dictionary_resource(&param)) {
        std::cout << param.what() << std::endl;
        // Loading failed, so report failure (was EXIT_SUCCESS).
        return EXIT_FAILURE;
    }

    if (param.get<int>("lattice-level") >= 1) {
        std::cerr << "lattice-level is DEPERCATED. "
                  << "use --marginal or --nbest." << std::endl;
    }

    //debug
    std::cout << "[" << __LINE__ << "]: " << __FILE__ << std::endl;
    int dbg = param.get<int>("lattice-level");

    std::cout << "	param.get<int>(\"lattice-level\") => " << dbg << std::endl;

    ///

    MeCab::scoped_ptr<MeCab::ModelImpl> model(new MeCab::ModelImpl);
    if (!model->open(param)) {
        std::cout << MeCab::getLastError() << std::endl;
        return EXIT_FAILURE;
    }

    std::string ofilename = param.get<std::string>("output");
    if (ofilename.empty()) {
        ofilename = "-";
    }

    const int nbest = param.get<int>("nbest");
    if (nbest <= 0 || nbest > NBEST_MAX) {
        WHAT_ERROR("invalid N value");
    }

    MeCab::ostream_wrapper ofs(ofilename.c_str());
    if (!*ofs) {
        WHAT_ERROR("no such file or directory: " << ofilename);
    }

    if (param.get<bool>("dump-config")) {
        param.dump_config(&*ofs);
        // The dump was produced as requested (was EXIT_FAILURE).
        return EXIT_SUCCESS;
    }

    if (param.get<bool>("dictionary-info")) {
        for (const MeCab::DictionaryInfo *d = model->dictionary_info();
                d; d = d->next) {
            *ofs << "filename:\t" << d->filename << std::endl;
            *ofs << "version:\t" << d->version << std::endl;
            *ofs << "charset:\t" << d->charset << std::endl;
            *ofs << "type:\t" << d->type   << std::endl;
            *ofs << "size:\t" << d->size << std::endl;
            *ofs << "left size:\t" << d->lsize << std::endl;
            *ofs << "right size:\t" << d->rsize << std::endl;
            *ofs << std::endl;
        }
        // The listing was produced as requested (was EXIT_FAILURE).
        return EXIT_SUCCESS;
    }

    // Work on a copy so a default input can be appended.
    std::vector<std::string> rest = param.rest_args();
    if (rest.empty()) {
        rest.push_back("-");
    }

    // Clamp the requested buffer size to [MIN, MAX]_INPUT_BUFFER_SIZE.
    size_t ibufsize = std::min(MAX_INPUT_BUFFER_SIZE,
                               std::max(param.get<int>
                                        ("input-buffer-size"),
                                        MIN_INPUT_BUFFER_SIZE));

    const bool partial = param.get<bool>("partial");
    if (partial) {
        // Partial mode buffers a whole multi-line sentence at once.
        ibufsize *= 8;
    }

    MeCab::scoped_array<char> ibuf_data(new char[ibufsize]);
    char *ibuf = ibuf_data.get();

    MeCab::scoped_ptr<MeCab::Tagger> tagger(model->createTagger());

    if (!tagger.get()) {
        WHAT_ERROR("cannot create tagger");
    }

    for (size_t i = 0; i < rest.size(); ++i) {
        MeCab::istream_wrapper ifs(rest[i].c_str());
        if (!*ifs) {
            WHAT_ERROR("no such file or directory: " << rest[i]);
        }

        while (true) {
            if (!partial) {
                ifs->getline(ibuf, ibufsize);
            } else {
                // Collect lines up to and including "EOS" or an empty line.
                std::string sentence;
                MeCab::scoped_fixed_array<char, BUF_SIZE> line;
                for (;;) {
                    if (!ifs->getline(line.get(), line.size())) {
                        ifs->clear(std::ios::eofbit|std::ios::badbit);
                        break;
                    }
                    sentence += line.get();
                    sentence += '\n';
                    if (std::strcmp(line.get(), "EOS") == 0 || line[0] == '\0') {
                        break;
                    }
                }
                std::strncpy(ibuf, sentence.c_str(), ibufsize);
                // strncpy() writes no terminator when the sentence fills
                // the buffer; terminate explicitly so the tagger never
                // reads past the end of ibuf.
                if (ibufsize > 0) {
                    ibuf[ibufsize - 1] = '\0';
                }
            }
            // End of this input: move on to the next file instead of
            // leaving the function, otherwise only the first file would
            // ever be processed.
            if (ifs->eof() && !ibuf[0]) {
                break;
            }
            if (ifs->fail()) {
                std::cerr << "input-buffer overflow. "
                          << "The line is split. use -b #SIZE option." << std::endl;
                ifs->clear();  // reset the error state so reading can continue
            }
            const char *r = (nbest >= 2) ? tagger->parseNBest(nbest, ibuf) :
                            tagger->parse(ibuf);
            if (!r)  {
                WHAT_ERROR(tagger->what());
            }
            *ofs << r << std::flush;
        }
    }

    return EXIT_SUCCESS;

#undef WHAT_ERROR
}