Exemple #1
0
#if defined(_WIN32) && !defined(__CYGWIN__)
// Reads the REG_SZ value "mecabrc" stored under <root>\software\mecab and
// stores it (converted with WideToUtf8) in *rcfile.
// Returns false and leaves *rcfile untouched when the key cannot be opened,
// the value cannot be read, or the value is not of type REG_SZ.
static bool read_rcfile_from_registry(HKEY root, std::string *rcfile) {
  HKEY hKey;
  if (::RegOpenKeyExW(root, L"software\\mecab", 0, KEY_READ, &hKey) !=
      ERROR_SUCCESS) {
    return false;
  }

  scoped_fixed_array<wchar_t, BUF_SIZE> v;
  DWORD vt = 0;
  // The size argument is in/out and expressed in bytes. One element is held
  // back so the result can always be NUL-terminated below; the registry does
  // not guarantee that stored strings carry a terminator.
  DWORD size = static_cast<DWORD>((v.size() - 1) * sizeof(wchar_t));
  const LONG status =
      ::RegQueryValueExW(hKey, L"mecabrc", 0, &vt,
                         reinterpret_cast<BYTE *>(v.get()), &size);
  ::RegCloseKey(hKey);

  if (status != ERROR_SUCCESS || vt != REG_SZ) {
    return false;
  }

  v.get()[size / sizeof(wchar_t)] = L'\0';
  *rcfile = WideToUtf8(v.get());
  return true;
}
#endif

// Locates the mecabrc resource file, loads it into `param`, then resolves the
// "dicdir" parameter and loads the DICRC file found in that directory.
//
// The rc file is looked up in this order, stopping at the first hit:
//   1. the "rcfile" parameter already present in `param`
//   2. $HOME/.mecabrc, if it can be opened            (HAVE_GETENV only)
//   3. the MECABRC environment variable               (HAVE_GETENV only)
//   4. the wide-character MECABRC environment variable (Windows only)
//   5. registry value "mecabrc" under HKLM, then HKCU, software\mecab
//      (Windows only)
//   6. a file named "mecabrc" next to the module DllInstance (Windows only)
//   7. MECAB_DEFAULT_RC
//
// Returns false when either the rc file or the dictionary rc file fails to
// load; true otherwise.
bool load_dictionary_resource(Param *param) {
  // NOTE(review): debug trace written to stdout; kept so that the observable
  // output of this function is unchanged.
  std::cout << "[" << __FILE__ << ":" << __LINE__ << "]: "
            << "load_dictionary_resource(Param *param)" << std::endl;

  std::string rcfile = param->get<std::string>("rcfile");

#ifdef HAVE_GETENV
  if (rcfile.empty()) {
    // NOTE(review): debug trace, see above.
    std::cout << "[" << __FILE__ << ":" << __LINE__ << "]: "
              << "rcfile.empty()" << std::endl;

    const char *homedir = getenv("HOME");
    if (homedir) {
      const std::string s = MeCab::create_filename(std::string(homedir),
                                                   ".mecabrc");
      // Only use ~/.mecabrc when it can actually be opened.
      std::ifstream ifs(WPATH(s.c_str()));
      if (ifs) {
        rcfile = s;
      }
    }
  }

  if (rcfile.empty()) {
    const char *rcenv = getenv("MECABRC");
    if (rcenv) {
      rcfile = rcenv;
    }
  }
#endif

#if defined (HAVE_GETENV) && defined(_WIN32) && !defined(__CYGWIN__)
  if (rcfile.empty()) {
    scoped_fixed_array<wchar_t, BUF_SIZE> buf;
    const DWORD len = ::GetEnvironmentVariableW(L"MECABRC",
                                                buf.get(),
                                                buf.size());
    // len >= buf.size() means the buffer was too small; 0 means not set.
    if (len < buf.size() && len > 0) {
      rcfile = WideToUtf8(buf.get());
    }
  }
#endif

#if defined(_WIN32) && !defined(__CYGWIN__)
  if (rcfile.empty()) {
    read_rcfile_from_registry(HKEY_LOCAL_MACHINE, &rcfile);
  }

  if (rcfile.empty()) {
    read_rcfile_from_registry(HKEY_CURRENT_USER, &rcfile);
  }

  if (rcfile.empty()) {
    scoped_fixed_array<wchar_t, BUF_SIZE> module_path;
    // GetModuleFileNameW takes the buffer size in characters, not bytes, and
    // returns that same size when the path had to be truncated.
    const DWORD capacity = static_cast<DWORD>(module_path.size());
    const DWORD len =
        ::GetModuleFileNameW(DllInstance, module_path.get(), capacity);
    if (len != 0 && len < capacity) {
      scoped_fixed_array<wchar_t, _MAX_DRIVE> drive;
      // The directory component needs _MAX_DIR elements; _MAX_DRIVE is far
      // too small and lets _wsplitpath write past the end of the buffer.
      scoped_fixed_array<wchar_t, _MAX_DIR> dir;
      _wsplitpath(module_path.get(), drive.get(), dir.get(), NULL, NULL);
      const std::wstring path =
          std::wstring(drive.get()) + std::wstring(dir.get()) + L"mecabrc";
      if (::GetFileAttributesW(path.c_str()) != INVALID_FILE_ATTRIBUTES) {
        rcfile = WideToUtf8(path);
      }
    }
  }
#endif

  if (rcfile.empty()) {
    rcfile = MECAB_DEFAULT_RC;
  }

  if (!param->load(rcfile.c_str())) {
    return false;
  }

  std::string dicdir = param->get<std::string>("dicdir");
  if (dicdir.empty()) {
    dicdir = ".";  // current
  }
  // rcfile is reduced to its directory so that "$(rcpath)" inside dicdir
  // expands to the location of the rc file.
  remove_filename(&rcfile);
  replace_string(&dicdir, "$(rcpath)", rcfile);
  param->set<std::string>("dicdir", dicdir, true);
  dicdir = create_filename(dicdir, DICRC);

  if (!param->load(dicdir.c_str())) {
    return false;
  }

  return true;
}
Exemple #2
0
// Trains a model and writes it to `modelfile`.
//
//   templfile      feature template file, passed to feature_index.open()
//   trainfile      training data; opened by feature_index.open() and read
//                  again here sentence by sentence
//   modelfile      output path handed to feature_index.save()
//   textmodelfile  forwarded to feature_index.save()
//   maxitr, eta, C, shrinking_size
//                  forwarded unchanged to runMIRA() / runCRF()
//   freq           forwarded to feature_index.shrink()
//   thread_num     number of threads; sentences are assigned to thread ids
//                  round-robin
//   algorithm      MIRA, CRF_L2 or CRF_L1; any other value trains nothing
//                  and saves the zero-initialised weights
//
// Returns false after reporting the reason (via CHECK_FALSE or std::cerr)
// when argument validation, reading, training or saving fails.
bool Encoder::learn(const char *templfile,
                    const char *trainfile,
                    const char *modelfile,
                    bool textmodelfile,
                    size_t maxitr,
                    size_t freq,
                    double eta,
                    double C,
                    unsigned short thread_num,
                    unsigned short shrinking_size,
                    int algorithm) {
  std::cout << COPYRIGHT << std::endl;

  CHECK_FALSE(eta > 0.0) << "eta must be > 0.0";
  CHECK_FALSE(C >= 0.0) << "C must be >= 0.0";
  CHECK_FALSE(shrinking_size >= 1) << "shrinking-size must be >= 1";
  CHECK_FALSE(thread_num > 0) << "thread must be > 0";

#ifndef CRFPP_USE_THREAD
  CHECK_FALSE(thread_num == 1)
      << "This architecture doesn't support multi-thrading";
#endif

  // Only a warning: thread_num is still forwarded unchanged to runMIRA().
  if (algorithm == MIRA && thread_num > 1) {
    std::cerr <<  "MIRA doesn't support multi-thrading. use thread_num=1"
              << std::endl;
  }

  EncoderFeatureIndex feature_index;
  Allocator allocator(thread_num);
  std::vector<TaggerImpl* > x;  // owns every TaggerImpl it holds
  x.reserve(max_line_nums);  // pre-size to avoid reallocation in push_back()

  std::cout.setf(std::ios::fixed, std::ios::floatfield);
  std::cout.precision(5);

// Releases every tagger still owned by `x`, reports `msg` and fails.
#define WHAT_ERROR(msg) do {                                    \
    for (std::vector<TaggerImpl *>::iterator it = x.begin();    \
         it != x.end(); ++it)                                   \
      delete *it;                                               \
    std::cerr << msg << std::endl;                              \
    return false; } while (0)

  CHECK_FALSE(feature_index.open(templfile, trainfile))
      << feature_index.what();

  {
    progress_timer pg;

    std::ifstream ifs(WPATH(trainfile));
    CHECK_FALSE(ifs) << "cannot open: " << trainfile;

    std::cout << "reading training data: " << std::flush;
    size_t line = 0;
    while (ifs) {
      TaggerImpl *_x = new TaggerImpl();
      _x->open(&feature_index, &allocator);
      if (!_x->read(&ifs) || !_x->shrink()) {
        // _x has not been handed to `x` yet, so WHAT_ERROR would not free
        // it. Copy the message first, because it lives inside _x.
        const std::string error_message(_x->what());
        delete _x;
        WHAT_ERROR(error_message);
      }

      if (!_x->empty()) {
        x.push_back(_x);
      } else {
        delete _x;
        continue;
      }

      _x->set_thread_id(line % thread_num);

      if (++line % 10000 == 0) {
        std::cout << line << ".. " << std::endl << std::flush;
      }
    }

    ifs.close();
    std::cout << "\nDone!";
  }

  feature_index.shrink(freq, &allocator);

  std::vector <double> alpha(feature_index.size());           // parameter
  std::fill(alpha.begin(), alpha.end(), 0.0);
  feature_index.set_alpha(&alpha[0]);

  std::cout << "Number of sentences: " << x.size() << std::endl;
  std::cout << "Number of features:  " << feature_index.size() << std::endl;
  std::cout << "Number of thread(s): " << thread_num << std::endl;
  std::cout << "Freq:                " << freq << std::endl;
  std::cout << "eta:                 " << eta << std::endl;
  std::cout << "C:                   " << C << std::endl;
  std::cout << "shrinking size:      " << shrinking_size
            << std::endl;

  progress_timer pg;

  switch (algorithm) {
    case MIRA:
      if (!runMIRA(x, &feature_index, &alpha[0],
                   maxitr, C, eta, shrinking_size, thread_num)) {
        WHAT_ERROR("MIRA execute error");
      }
      break;
    case CRF_L2:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, false)) {
        WHAT_ERROR("CRF_L2 execute error");
      }
      break;
    case CRF_L1:
      if (!runCRF(x, &feature_index, &alpha[0],
                  maxitr, C, eta, shrinking_size, thread_num, true)) {
        WHAT_ERROR("CRF_L1 execute error");
      }
      break;
  }

  for (std::vector<TaggerImpl *>::iterator it = x.begin();
       it != x.end(); ++it) {
    delete *it;
  }
  // Drop the now-dangling pointers so that a WHAT_ERROR below cannot delete
  // them a second time.
  x.clear();

  if (!feature_index.save(modelfile, textmodelfile)) {
    WHAT_ERROR(feature_index.what());
  }

  std::cout << "\nDone!";

  return true;
}
// Writes the model to `filename` in binary form and, when `textmodelfile` is
// true, also writes a human-readable copy to `filename` + ".txt".
//
// Binary layout, in write order: version (unsigned int), type (int, always
// 0), cost_factor_, maxid_, xsize_, byte size of the double array, byte size
// of the label block followed by the NUL-separated labels, byte size of the
// template block followed by the NUL-separated unigram then bigram templates
// (NUL-padded so labels + templates are a multiple of 4 bytes), the double
// array itself, and finally maxid_ weights narrowed to float.
//
// Side effect: when max_xsize_ > 0, xsize_ is clamped to max_xsize_.
// Returns false (through CHECK_FALSE) when the double array cannot be built,
// a file cannot be opened, or writing to either file fails.
bool EncoderFeatureIndex::save(const char *filename,
                               bool textmodelfile) {
  std::vector<char *> key;
  std::vector<int>    val;

  std::string y_str;
  for (size_t i = 0; i < y_.size(); ++i) {
    y_str += y_[i];
    y_str += '\0';
  }

  std::string templ_str;
  for (size_t i = 0; i < unigram_templs_.size(); ++i) {
    templ_str += unigram_templs_[i];
    templ_str += '\0';
  }

  for (size_t i = 0; i < bigram_templs_.size(); ++i) {
    templ_str += bigram_templs_[i];
    templ_str += '\0';
  }

  // Pad so that the data following the two string blocks stays 4-byte
  // aligned relative to their start.
  while ((y_str.size() + templ_str.size()) % 4 != 0) {
    templ_str += '\0';
  }

  // std::map iterates in key order, so `key` is sorted.
  for (std::map<std::string, std::pair<int, unsigned int> >::iterator
           it = dic_.begin();
       it != dic_.end(); ++it) {
    key.push_back(const_cast<char *>(it->first.c_str()));
    val.push_back(it->second.first);
  }

  Darts::DoubleArray da;

  // Indexing element 0 of an empty vector is undefined, so pass null
  // pointers when the dictionary holds no entries.
  const bool has_entries = !key.empty();
  CHECK_FALSE(da.build(key.size(),
                       has_entries ? &key[0] : 0,
                       0,
                       has_entries ? &val[0] : 0) == 0)
      << "cannot build double-array";

  std::ofstream bofs;
  bofs.open(WPATH(filename), OUTPUT_MODE);

  CHECK_FALSE(bofs) << "open failed: " << filename;

  unsigned int version_ = version;
  bofs.write(reinterpret_cast<char *>(&version_), sizeof(unsigned int));

  int type = 0;
  bofs.write(reinterpret_cast<char *>(&type), sizeof(type));
  bofs.write(reinterpret_cast<char *>(&cost_factor_), sizeof(cost_factor_));
  bofs.write(reinterpret_cast<char *>(&maxid_), sizeof(maxid_));

  if (max_xsize_ > 0) {
    xsize_ = std::min(xsize_, max_xsize_);
  }
  bofs.write(reinterpret_cast<char *>(&xsize_), sizeof(xsize_));
  unsigned int dsize = da.unit_size() * da.size();
  bofs.write(reinterpret_cast<char *>(&dsize), sizeof(dsize));
  unsigned int size = y_str.size();
  bofs.write(reinterpret_cast<char *>(&size),  sizeof(size));
  bofs.write(y_str.data(), y_str.size());
  size = templ_str.size();
  bofs.write(reinterpret_cast<char *>(&size),  sizeof(size));
  bofs.write(templ_str.data(), templ_str.size());
  bofs.write(reinterpret_cast<const char *>(da.array()), dsize);

  for (size_t i  = 0; i < maxid_; ++i) {
    float alpha = static_cast<float>(alpha_[i]);
    bofs.write(reinterpret_cast<char *>(&alpha), sizeof(alpha));
  }

  bofs.close();

  // A failed write or close leaves the stream in a failed state; without
  // this check a truncated model would be reported as saved.
  CHECK_FALSE(bofs) << "write failed: " << filename;

  if (textmodelfile) {
    std::string filename2 = filename;
    filename2 += ".txt";

    std::ofstream tofs(WPATH(filename2.c_str()));

    CHECK_FALSE(tofs) << " no such file or directory: " << filename2;

    // header
    tofs << "version: "     << version_ << std::endl;
    tofs << "cost-factor: " << cost_factor_ << std::endl;
    tofs << "maxid: "       << maxid_ << std::endl;
    tofs << "xsize: "       << xsize_ << std::endl;

    tofs << std::endl;

    // y
    for (size_t i = 0; i < y_.size(); ++i) {
      tofs << y_[i] << std::endl;
    }

    tofs << std::endl;

    // template
    for (size_t i = 0; i < unigram_templs_.size(); ++i) {
      tofs << unigram_templs_[i] << std::endl;
    }

    for (size_t i = 0; i < bigram_templs_.size(); ++i) {
      tofs << bigram_templs_[i] << std::endl;
    }

    tofs << std::endl;

    // dic
    for (std::map<std::string, std::pair<int, unsigned int> >::iterator
             it = dic_.begin();
         it != dic_.end(); ++it) {
      tofs << it->second.first << " " << it->first << std::endl;
    }

    tofs << std::endl;

    tofs.setf(std::ios::fixed, std::ios::floatfield);
    tofs.precision(16);

    for (size_t i  = 0; i < maxid_; ++i) {
      tofs << alpha_[i] << std::endl;
    }

    tofs.close();
    CHECK_FALSE(tofs) << "write failed: " << filename2;
  }

  return true;
}
bool EncoderFeatureIndex::convert(const char *text_filename,
                                  const char *binary_filename) {
  std::ifstream ifs(WPATH(text_filename));

  y_.clear();
  dic_.clear();
  unigram_templs_.clear();
  bigram_templs_.clear();
  xsize_ = 0;
  maxid_ = 0;

  CHECK_FALSE(ifs) << "open failed: " << text_filename;

  scoped_fixed_array<char, 8192> line;
  char *column[8];

  // read header
  while (true) {
    CHECK_FALSE(ifs.getline(line.get(), line.size()))
        << " format error: " << text_filename;

    if (std::strlen(line.get()) == 0) {
      break;
    }

    const size_t size = tokenize(line.get(), "\t ", column, 2);

    CHECK_FALSE(size == 2) << "format error: " << text_filename;

    if (std::strcmp(column[0], "xsize:") == 0) {
      xsize_ = std::atoi(column[1]);
    }

    if (std::strcmp(column[0], "maxid:") == 0) {
      maxid_ = std::atoi(column[1]);
    }
  }

  CHECK_FALSE(maxid_ > 0) << "maxid is not defined: " << text_filename;

  CHECK_FALSE(xsize_ > 0) << "xsize is not defined: " << text_filename;

  while (true) {
    CHECK_FALSE(ifs.getline(line.get(), line.size()))
        << "format error: " << text_filename;
    if (std::strlen(line.get()) == 0) {
      break;
    }
    y_.push_back(line.get());
  }

  while (true) {
    CHECK_FALSE(ifs.getline(line.get(), line.size()))
        << "format error: " << text_filename;
    if (std::strlen(line.get()) == 0) {
      break;
    }
    if (line[0] == 'U') {
      unigram_templs_.push_back(line.get());
    } else if (line[0] == 'B') {
      bigram_templs_.push_back(line.get());
    } else {
      CHECK_FALSE(true) << "unknown type: " << line.get()
                        << " " << text_filename;
    }
  }

  while (true) {
    CHECK_FALSE(ifs.getline(line.get(), line.size()))
        << "format error: " << text_filename;
    if (std::strlen(line.get()) == 0) {
      break;
    }

    const size_t size = tokenize(line.get(), "\t ", column, 2);
    CHECK_FALSE(size == 2) << "format error: " << text_filename;

    dic_.insert(std::make_pair
                (std::string(column[1]),
                 std::make_pair(std::atoi(column[0]),
                                static_cast<unsigned int>(1))));
  }

  std::vector<double> alpha;
  while (ifs.getline(line.get(), line.size())) {
    alpha.push_back(std::atof(line.get()));
  }

  alpha_ = &alpha[0];

  CHECK_FALSE(alpha.size() == maxid_) << " file is broken: "  << text_filename;

  return save(binary_filename, false);
}
Exemple #5
0
bool CharProperty::compile(const char *cfile,
                           const char *ufile,
                           const char *ofile) {
  scoped_fixed_array<char, BUF_SIZE> line;
  scoped_fixed_array<char *, 512> col;
  size_t id = 0;
  std::vector<Range> range;
  std::map<std::string, CharInfo> category;
  std::vector<std::string> category_ary;
  std::ifstream ifs(WPATH(cfile));
  std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT);
  std::istream *is = &ifs;

  if (!ifs) {
    std::cerr << cfile
              << " is not found. minimum setting is used" << std::endl;
    is = &iss;
  }

  while (is->getline(line.get(), line.size())) {
    if (std::strlen(line.get()) == 0 || line[0] == '#') {
      continue;
    }
    const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size());
    CHECK_DIE(size >= 2) << "format error: " << line.get();

    // 0xFFFF..0xFFFF hoge hoge hgoe #
    if (std::strncmp(col[0], "0x", 2) == 0) {
      std::string low = col[0];
      std::string high;
      size_t pos = low.find("..");

      if (pos != std::string::npos) {
        high = low.substr(pos + 2, low.size() - pos - 2);
        low  = low.substr(0, pos);
      } else {
        high = low;
      }

      Range r;
      r.low = atohex(low.c_str());
      r.high = atohex(high.c_str());

      CHECK_DIE(r.low >= 0 && r.low < 0xffff &&
                r.high >= 0 && r.high < 0xffff &&
                r.low <= r.high)
          << "range error: low=" << r.low << " high=" << r.high;

      for (size_t i = 1; i < size; ++i) {
        if (col[i][0] == '#') {
          break;  // skip comments
        }
        CHECK_DIE(category.find(std::string(col[i])) != category.end())
            << "category [" << col[i] << "] is undefined";
        r.c.push_back(col[i]);
      }
      range.push_back(r);
    } else {
      CHECK_DIE(size >= 4) << "format error: " << line.get();

      std::string key = col[0];
      CHECK_DIE(category.find(key) == category.end())
          << "category " << key << " is already defined";

      CharInfo c;
      std::memset(&c, 0, sizeof(c));
      c.invoke  = std::atoi(col[1]);
      c.group   = std::atoi(col[2]);
      c.length  = std::atoi(col[3]);
      c.default_type = id++;

      category.insert(std::pair<std::string, CharInfo>(key, c));
      category_ary.push_back(key);
    }
  }

  CHECK_DIE(category.size() < 18) << "too many categories(>= 18)";

  CHECK_DIE(category.find("DEFAULT") != category.end())
      << "category [DEFAULT] is undefined";

  CHECK_DIE(category.find("SPACE") != category.end())
      << "category [SPACE] is undefined";

  std::istringstream iss2(UNK_DEF_DEFAULT);
  std::ifstream ifs2(WPATH(ufile));
  std::istream *is2 = &ifs2;

  if (!ifs2) {
    std::cerr << ufile
              << " is not found. minimum setting is used." << std::endl;
    is2 = &iss2;
  }

  std::set<std::string> unk;
  while (is2->getline(line.get(), line.size())) {
    const size_t n = tokenizeCSV(line.get(), col.get(), 2);
    CHECK_DIE(n >= 1) << "format error: " << line.get();
    const std::string key = col[0];
    CHECK_DIE(category.find(key) != category.end())
        << "category [" << key << "] is undefined in " << cfile;
    unk.insert(key);
  }

  for (std::map<std::string, CharInfo>::const_iterator it = category.begin();
       it != category.end();
       ++it) {
    CHECK_DIE(unk.find(it->first) != unk.end())
        << "category [" << it->first << "] is undefined in " << ufile;
  }

  std::vector<CharInfo> table(0xffff);
  {
    std::vector<std::string> tmp;
    tmp.push_back("DEFAULT");
    const CharInfo c = encode(tmp, &category);
    std::fill(table.begin(), table.end(), c);
  }

  for (std::vector<Range>::const_iterator it = range.begin();
       it != range.end();
       ++it) {
    const CharInfo c = encode(it->c, &category);
    for (int i = it->low; i <= it->high; ++i) {
      table[i] = c;
    }
  }

  // output binary table
  {
    std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out);
    CHECK_DIE(ofs) << "permission denied: " << ofile;

    unsigned int size = static_cast<unsigned int>(category.size());
    ofs.write(reinterpret_cast<const char*>(&size), sizeof(size));
    for (std::vector<std::string>::const_iterator it = category_ary.begin();
         it != category_ary.end();
         ++it) {
      char buf[32];
      std::fill(buf, buf + sizeof(buf), '\0');
      std::strncpy(buf, it->c_str(), sizeof(buf) - 1);
      ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf));
    }
    ofs.write(reinterpret_cast<const char*>(&table[0]),
              sizeof(CharInfo) * table.size());
    ofs.close();
  }

  return true;
}
Exemple #6
0
  // Command-line entry point of the evaluation tool.
  //
  // Expects two positional arguments: the system output file and the answer
  // file. Both are read sentence by sentence with read(); tokens are aligned
  // by accumulated surface length, and for every requested level the number
  // of aligned tokens whose field matches is counted. The resulting
  // precision / recall / F table is printed with printeval() to the file
  // named by --output ("-" when not given).
  //
  // Returns true after the table has been written. Returns false on option
  // parse errors, when fewer than two files are given, and also when
  // help_version() returns false. Unreadable input and an unusable level
  // specification are handled through CHECK_DIE.
  static bool eval(int argc, char **argv) {
    static const MeCab::Option long_options[] = {
      { "level",  'l',  "0 -1",    "STR",    "set level of evaluations" },
      { "output", 'o',  0,         "FILE",   "set the output file name" },
      { "version",  'v',  0,   0,    "show the version and exit"   },
      { "help",  'h',  0,   0,    "show this help and exit."   },
      { 0, 0, 0, 0 }
    };

    MeCab::Param param;

    // Parse exactly once; the arguments used to be parsed twice.
    if (!param.open(argc, argv, long_options)) {
      std::cout << param.what() << "\n\n" <<  COPYRIGHT
                << "\ntry '--help' for more information." << std::endl;
#if 1 /* for Open JTalk */
      return false;
#else
      return -1;
#endif
    }

    // NOTE(review): this path used to `return 0`, which is `false` for a
    // bool function; the value is kept as-is.
    if (!param.help_version()) return false;

    const std::vector<std::string> &files = param.rest_args();
    if (files.size() < 2) {
      std::cout << "Usage: " <<
          param.program_name() << " output answer" << std::endl;
#if 1 /* for Open JTalk */
      return false;
#else
      return -1;
#endif
    }

    std::string output = param.get<std::string>("output");
    if (output.empty()) output = "-";
    MeCab::ostream_wrapper ofs(output.c_str());
    CHECK_DIE(*ofs) << "no such file or directory: " << output;

    const std::string &system_file = files[0];
    const std::string &answer_file = files[1];

    const std::string level_str = param.get<std::string>("level");

    std::ifstream ifs1(WPATH(system_file.c_str()));
    std::ifstream ifs2(WPATH(answer_file.c_str()));

    CHECK_DIE(ifs1) << "no such file or directory: " << system_file.c_str();
    // Report the answer file here; the message used to name the system file.
    CHECK_DIE(ifs2) << "no such file or directory: " << answer_file.c_str();
    CHECK_DIE(!level_str.empty()) << "level_str is NULL";

    std::vector<int> level;
    parseLevel(level_str.c_str(), &level);
    CHECK_DIE(level.size()) << "level_str is empty: " << level_str;
    std::vector<size_t> result_tbl(level.size());
    std::fill(result_tbl.begin(), result_tbl.end(), 0);

    size_t prec = 0;    // number of tokens in the system output
    size_t recall = 0;  // number of tokens in the answer

    std::vector<std::vector<std::string> > r1;
    std::vector<std::vector<std::string> > r2;

    while (true) {
      if (!read(&ifs1, &r1, level) || !read(&ifs2, &r2, level))
        break;

      size_t i1 = 0;  // token index in r1 / r2
      size_t i2 = 0;
      size_t p1 = 0;  // accumulated surface length consumed so far
      size_t p2 = 0;

      while (i1 < r1.size() && i2 < r2.size()) {
        if (p1 == p2) {
          // Both tokens start at the same offset: compare field by field.
          for (size_t i = 0; i < result_tbl.size(); ++i) {
            if (r1[i1][i] == r2[i2][i]) {
              result_tbl[i]++;
            }
          }
          p1 += r1[i1][0].size();
          p2 += r2[i2][0].size();
          ++i1;
          ++i2;
          ++prec;
          ++recall;
        } else if (p1 < p2) {
          p1 += r1[i1][0].size();
          ++i1;
          ++prec;
        } else {
          p2 += r2[i2][0].size();
          ++i2;
          ++recall;
        }
      }

      // Tokens left over on either side still count towards the totals.
      while (i1 < r1.size()) {
        ++prec;
        ++i1;
      }

      while (i2 < r2.size()) {
        ++recall;
        ++i2;
      }
    }

    *ofs <<  "              precision          recall         F"
         << std::endl;
    for (size_t i = 0; i < result_tbl.size(); ++i) {
      if (level[i] == -1) {
        *ofs << "LEVEL ALL: ";
      } else {
        *ofs << "LEVEL " << level[i] << ":    ";
      }
      printeval(&*ofs, result_tbl[i], prec, recall);
    }

    return true;
  }