Esempio n. 1
0
File: ne.cpp Progetto: humem/cabocha
bool NE::open(const Param &param) {
  close();

  if (action_mode() == PARSING_MODE) {
    const std::string filename = param.get<std::string>("ne-model");
    std::vector<const char*> argv;
    argv.push_back(param.program_name());
    argv.push_back("-m");
    argv.push_back(filename.c_str());
    model_ = crfpp_model_new(argv.size(),
                              const_cast<char **>(&argv[0]));
    // CHECK_FALSE(tagger_) << crfpp_strerror(tagger_);
    // CHECK_FALSE(crfpp_ysize(tagger_) >= 2);
    // CHECK_FALSE(crfpp_xsize(tagger_) == 3);
    // for (size_t i = 0; i < crfpp_ysize(tagger_); ++i) {
    //   const char *p = crfpp_yname(tagger_, i);
    //   CHECK_FALSE(p && (p[0] == 'B' || p[0] == 'I' || p[0] == 'O'));
    // }
  }

  // "名詞,数,"
  ne_composite_ipa_   ="\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0,";
  // "名詞,数詞"
  ne_composite_juman_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,";
  // "名詞,数詞"
  ne_composite_unidic_ = "\xE5\x90\x8D\xE8\xA9\x9E,\xE6\x95\xB0\xE8\xA9\x9E,";

  Iconv iconv;
  iconv.open(UTF8, charset());
  CHECK_DIE(iconv.convert(&ne_composite_ipa_));
  CHECK_DIE(iconv.convert(&ne_composite_juman_));
  CHECK_DIE(iconv.convert(&ne_composite_unidic_));
  CHECK_FALSE(!ne_composite_ipa_.empty());
  CHECK_FALSE(!ne_composite_juman_.empty());
  CHECK_FALSE(!ne_composite_unidic_.empty());

  return true;
}
bool Dictionary::compile(const Param &param,
                         const std::vector<std::string> &dics,
                         const char *matrix_file,
                         const char *matrix_bin_file,
                         const char *left_id_file,
                         const char *right_id_file,
                         const char *rewrite_file,
                         const char *pos_id_file,
                         const char *output) {
  Connector matrix;
  scoped_ptr<DictionaryRewriter> rewrite(0);
  scoped_ptr<POSIDGenerator> posid(0);
  scoped_ptr<ContextID> cid(0);
  scoped_ptr<Writer> writer(0);
  scoped_ptr<StringBuffer> os(0);
  Node node;

  std::vector<std::pair<std::string, Token*> > dic;

  size_t offset  = 0;
  unsigned int lexsize = 0;
  std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
  int lid, rid, cost;

  const std::string from = param.get<std::string>("dictionary-charset");
  const std::string to = param.get<std::string>("charset");
  const bool wakati = param.get<bool>("wakati");
  const int type = param.get<int>("type");
  const std::string node_format = param.get<std::string>("node-format");

  // for backward compatibility
  std::string config_charset = param.get<std::string>("config-charset");
  if (config_charset.empty()) config_charset = from;

  CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
  CHECK_DIE(!to.empty())   << "output dictionary charset is empty";

  Iconv iconv;
  CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;

  Iconv config_iconv;
  CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
      << "iconv_open() failed with from=" << config_charset << " to=" << from;

  if (!node_format.empty()) {
    writer.reset(new Writer);
    os.reset(new StringBuffer);
    memset(&node, 0, sizeof(node));
  }

  if (!matrix.openText(matrix_file) &&
      !matrix.open(matrix_bin_file)) {
    matrix.set_left_size(1);
    matrix.set_right_size(1);
  }

  posid.reset(new POSIDGenerator);
  posid->open(pos_id_file, &config_iconv);

  std::istringstream iss(UNK_DEF_DEFAULT);

  for (size_t i = 0; i < dics.size(); ++i) {
    std::ifstream ifs(dics[i].c_str());
    std::istream *is = &ifs;
    if (!ifs) {
      if (type == MECAB_UNK_DIC) {
        std::cerr << dics[i]
                  << " is not found. minimum setting is used." << std::endl;
        is = &iss;
      } else {
        CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
      }
    }

    std::cout << "reading " << dics[i] << " ... ";

    char line[BUF_SIZE];
    size_t num = 0;

    while (is->getline(line, sizeof(line))) {
      char *col[8];
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;

      w = col[0];
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = col[4];
      int pid = posid->id(feature.c_str());

      if (lid < 0  || rid < 0) {
        if (!rewrite.get()) {
          rewrite.reset(new DictionaryRewriter);
          rewrite->open(rewrite_file, &config_iconv);
        }

        CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
            << "rewrite failed: " << feature;

        if (!cid.get()) {
          cid.reset(new ContextID);
          cid->open(left_id_file, right_id_file, &config_iconv);
          CHECK_DIE(cid->left_size()  == matrix.left_size() &&
                    cid->right_size() == matrix.right_size())
              << "Context ID files("
              << left_id_file
              << " or "
              << right_id_file << " may be broken";
        }

        lid = cid->lid(lfeature.c_str());
        rid = cid->rid(rfeature.c_str());
      }

      CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;

      if (w.empty()) {
        std::cerr << "empty word is found, discard this line" << std::endl;
        continue;
      }

      if (!iconv.convert(&feature)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (!node_format.empty()) {
        node.surface = w.c_str();
        node.feature = feature.c_str();
        node.length  = w.size();
        node.rlength = w.size();
        node.posid   = pid;
        node.stat    = MECAB_NOR_NODE;
        CHECK_DIE(os.get());
        CHECK_DIE(writer.get());
        os->clear();
        CHECK_DIE(writer->writeNode(&*os,
                                    node_format.c_str(),
                                    w.c_str(),
                                    &node)) <<
            "conversion error: " << feature << " with " << node_format;
        *os << '\0';
        feature = os->str();
      }

      key.clear();
      if (!wakati) key = feature + '\0';

      Token* token  = new Token;
      token->lcAttr = lid;
      token->rcAttr = rid;
      token->posid  = pid;
      token->wcost = cost;
      token->feature = offset;
      token->compound = 0;
      dic.push_back(std::make_pair<std::string, Token*>(w, token));

      // append to output buffer
      if (!wakati) fbuf.append(key.data(), key.size());
      offset += key.size();

      ++num;
      ++lexsize;
    }

    std::cout << num << std::endl;
  }

  if (wakati) fbuf.append("\0", 1);

  std::sort(dic.begin(), dic.end());

  size_t bsize = 0;
  size_t idx = 0;
  std::string prev;
  std::vector<const char *> str;
  std::vector<size_t> len;
  std::vector<Darts::DoubleArray::result_type> val;

  for (size_t i = 0; i < dic.size(); ++i) {
    if (i != 0 && prev != dic[i].first) {
      str.push_back(dic[idx].first.c_str());
      len.push_back(dic[idx].first.size());
      val.push_back(bsize +(idx << 8));
      bsize = 1;
      idx = i;
    } else {
      ++bsize;
    }
    prev = dic[i].first;
  }
  str.push_back(dic[idx].first.c_str());
  len.push_back(dic[idx].first.size());
  val.push_back(bsize +(idx << 8));

  CHECK_DIE(str.size() == len.size());
  CHECK_DIE(str.size() == val.size());

  Darts::DoubleArray da;
  CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
                     &len[0], &val[0], &progress_bar_darts) == 0)
      << "unkown error in building double-array";

  std::string tbuf;
  for (size_t i = 0; i < dic.size(); ++i) {
    tbuf.append(reinterpret_cast<const char*>(dic[i].second),
                sizeof(Token));
    delete dic[i].second;
  }
  dic.clear();

  // needs to be 8byte(64bit) aligned
  while (tbuf.size() % 8 != 0) {
    Token dummy;
    memset(&dummy, 0, sizeof(Token));
    tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
  }

  unsigned int dummy = 0;
  unsigned int lsize = matrix.left_size();
  unsigned int rsize = matrix.right_size();
  unsigned int dsize = da.unit_size() * da.size();
  unsigned int tsize = tbuf.size();
  unsigned int fsize = fbuf.size();

  unsigned int version = DIC_VERSION;
  char charset[32];
  std::fill(charset, charset + sizeof(charset), '\0');
  std::strncpy(charset, to.c_str(), 31);

  std::ofstream bofs(output, std::ios::binary|std::ios::out);
  CHECK_DIE(bofs) << "permission denied: " << output;

  unsigned int magic = 0;

  // needs to be 64bit aligned
  // 10*32 = 64*5
  bofs.write(reinterpret_cast<const char *>(&magic),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&type),    sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&rsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&tsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&fsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dummy),   sizeof(unsigned int));

  // 32 * 8 = 64 * 4
  bofs.write(reinterpret_cast<const char *>(charset),  sizeof(charset));

  bofs.write(reinterpret_cast<const char*>(da.array()),
             da.unit_size() * da.size());
  bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
  bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());

  // save magic id
  magic = static_cast<unsigned int>(bofs.tellp());
  magic ^= DictionaryMagicID;
  bofs.seekp(0);
  bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));

  bofs.close();

  return true;
}