// Converts the text file `txtfile` into the binary file `binfile`.
//
// Each input line must hold two tab-separated columns: a numeric weight
// (parsed with atof) followed by a key string.  When a key appears more than
// once, only its first weight is kept.
//
// Output layout, in order:
//   1. unsigned int  number of distinct keys
//   2. double[]      one weight per key, in the key order of std::map
//   3. raw bytes     the double-array built over those keys, same order
//
// Terminates through CHECK_DIE on an unreadable input, a malformed or
// over-long line, an unwritable output or a double-array build failure.
// Returns true otherwise.
bool FeatureIndex::convert(const char* txtfile, const char *binfile) {
  std::ifstream ifs(txtfile);

  CHECK_DIE(ifs) << "no such file or directory: " << txtfile;

  char buf[BUF_SIZE];
  char *column[4];
  std::map<std::string, double> dic;

  while (ifs.getline(buf, sizeof(buf))) {
    CHECK_DIE(tokenize2(buf, "\t", column, 2) == 2)
        << "format error: " << buf;

    // std::map::insert keeps the existing entry when the key is a duplicate.
    dic.insert(std::make_pair(std::string(column[1]), atof(column[0])));
  }

  // getline() also stops when a line does not fit into buf (failbit without
  // eofbit).  Report that instead of silently dropping the rest of the file.
  CHECK_DIE(ifs.eof()) << "line too long or read error: " << txtfile;

  std::ofstream ofs(binfile, std::ios::out | std::ios::binary);
  CHECK_DIE(ofs) << "permission denied: " << binfile;

  std::vector<char *> key;
  key.reserve(dic.size());
  unsigned int size = static_cast<unsigned int>(dic.size());
  ofs.write(reinterpret_cast<const char*>(&size), sizeof(unsigned int));

  for (std::map<std::string, double>::const_iterator
           it = dic.begin(); it != dic.end(); ++it) {
    // The pointers stay valid because dic is not modified afterwards.
    key.push_back(const_cast<char*>(it->first.c_str()));
    ofs.write(reinterpret_cast<const char*>(&it->second), sizeof(double));
  }

  Darts::DoubleArray da;
  // Never index an empty vector: pass a null key table when nothing was read.
  CHECK_DIE(da.build(key.size(), key.empty() ? 0 : &key[0], 0, 0, 0) == 0)
      << "unkown error in building double array: " << binfile;

  ofs.write(reinterpret_cast<const char*>(da.array()),
            da.unit_size() * da.size());

  // Closing flushes the buffer, so a failed write becomes visible here.
  ofs.close();
  CHECK_DIE(!ofs.fail()) << "write failed: " << binfile;

  return true;
}
// Example #2
// 0
int main (int argc, char **argv)
{
  std::string file  = "";
  std::string index = "";
  std::string ofile = "";
  extern char *optarg;

  int opt;
  while ((opt = getopt(argc, argv, "i:o:O:")) != -1) {
    switch(opt) {
    case 'i':
      file = std::string (optarg);
      break;
    case 'o':
      index = std::string (optarg);
      break;
    case 'O':
      ofile = std::string (optarg);
       break;
     default:
      std::cout << "Usage: " << argv[0] << OPT << std::endl;
      return -1;
    }
  }

  if (file.empty () || index.empty ()) {
    std::cout << "Usage: " << argv[0] << OPT << std::endl;
    return -1;
  }

  std::istream *is;
  if (file == "-")  is = &std::cin;
  else              is = new std::ifstream (file.c_str());

  if (! *is) {
    std::cerr << "Cannot Open: " << file << std::endl;
    return -1;
  }

  std::vector <Darts::DoubleArray::key_type *> ary;
  std::vector <std::pair<const char *, double> > ary2;
  std::vector <double> alpha;
  std::map<std::string, double> rules;

  char buf[8192];
  char *column[2];
  double bias = 0.0;
  double alpha_sum = 0.0;
  double l1_norm = 0.0;
  double l2_norm = 0.0;

	while (is->getline (buf, 8192)) {
	  	if (buf[strlen(buf) - 1] == '\r') {
	  		buf[strlen(buf) - 1] = '\0';
	  	}

	  	//cout << "\nline:" << no_cr_line;
      	//cout.flush();
    	if (2 != tokenize (buf, "\t ", column, 2)) {
	 		std::cerr << "FATAL: Format Error: " << buf << std::endl;
	 		return -1;
      	}
    	// Ignore rules containing only 1 character.
    	//if (strlen(column[1]) <= 1) continue;

		double a = atof (column[0]);
    	bias -= a;
    	alpha_sum += std::abs (a);
    	rules[column[1]] += 2 * a;
  	}

  bias /= alpha_sum;
  //bias = 0;
 l1_norm = alpha_sum;

  for (std::map<std::string, double>::iterator it = rules.begin(); it != rules.end(); ++it) {
    double a = it->second / alpha_sum;
    l2_norm += 	pow(it->second, 2);

    ary2.push_back (std::make_pair <const char*, double>(it->first.c_str(), a));
    ary.push_back  ((Darts::DoubleArray::key_type *)it->first.c_str());
    alpha.push_back (a);
  }

  l2_norm = pow(l2_norm, 0.5);

  std::cout << "Total: " << alpha.size() << " rule(s)" << std::endl;
  std::cout << "l1_norm: " << l1_norm << ", l2_norm: " << l2_norm << std::endl;

  if (ary.empty()) {
    std::cerr << "FATAL: no feature is added" << std::endl;
    return -1;
  }

  if (file != "-") delete is;

  Darts::DoubleArray da;

  if (da.build (ary.size(), &ary[0], 0, 0, 0) != 0) {
    std::cerr << "Error: cannot build double array  " << file << std::endl;
    return -1;
  }

  std::ofstream ofs (index.c_str(), std::ios::binary|std::ios::out);

  if (!ofs) {
    std::cerr << "Error: cannot open " << index << std::endl;
    return -1;
  }

  unsigned int s = da.size() * da.unit_size();
  ofs.write ((char *)&s, sizeof (unsigned));
  ofs.write ((char *)da.array (), s);
  ofs.write ((char *)&bias, sizeof (double));
  ofs.write ((char *)&alpha[0], sizeof (double) * alpha.size());
  ofs.close ();

  if (! ary2.empty() && ! ofile.empty()) {
    std::ofstream ofs2 (ofile.c_str());
    if (! ofs2) {
       std::cerr << "Cannot Open: " << ofile << std::endl;
       return -1;
    }
    ofs2.precision (24);
    ofs2 << bias << std::endl;
    std::sort (ary2.begin(), ary2.end(), pair_2nd_cmp <const char*, double>());
    for (unsigned int i = 0; i < ary2.size (); ++i) ofs2 << ary2[i].second << " " << ary2[i].first << std::endl;
  }

  return 0;
}
// Writes the model to `filename` in binary form and, when `textmodelfile`
// is true, additionally to "<filename>.txt" in text form.
//
// Binary layout, in order:
//   version (unsigned int), type (int, always 0), cost_factor_, maxid_,
//   xsize_, byte size of the double array, byte size of the label block,
//   the labels (each NUL-terminated), byte size of the template block,
//   the unigram then bigram templates (each NUL-terminated, padded with NUL
//   so that labels + templates is a multiple of 4 bytes), the double-array
//   bytes, and finally maxid_ weights converted to float.
//
// Side effect: when max_xsize_ > 0, xsize_ is clamped to max_xsize_ before
// it is written.
//
// Returns false through CHECK_FALSE when the double array cannot be built
// or a file cannot be opened or written; returns true otherwise.
bool EncoderFeatureIndex::save(const char *filename,
                               bool textmodelfile) {
  std::vector<char *> key;
  std::vector<int>    val;

  std::string y_str;
  for (size_t i = 0; i < y_.size(); ++i) {
    y_str += y_[i];
    y_str += '\0';
  }

  std::string templ_str;
  for (size_t i = 0; i < unigram_templs_.size(); ++i) {
    templ_str += unigram_templs_[i];
    templ_str += '\0';
  }

  for (size_t i = 0; i < bigram_templs_.size(); ++i) {
    templ_str += bigram_templs_[i];
    templ_str += '\0';
  }

  // Pad so that the two string blocks together end on a 4-byte boundary.
  while ((y_str.size() + templ_str.size()) % 4 != 0) {
    templ_str += '\0';
  }

  for (std::map<std::string, std::pair<int, unsigned int> >::iterator
           it = dic_.begin();
       it != dic_.end(); ++it) {
    // The pointers stay valid because dic_ is not modified below.
    key.push_back(const_cast<char *>(it->first.c_str()));
    val.push_back(it->second.first);
  }

  Darts::DoubleArray da;

  // Never index an empty vector: pass null tables when dic_ is empty.
  CHECK_FALSE(da.build(key.size(),
                       key.empty() ? 0 : &key[0], 0,
                       val.empty() ? 0 : &val[0]) == 0)
      << "cannot build double-array";

  std::ofstream bofs;
  bofs.open(WPATH(filename), OUTPUT_MODE);

  CHECK_FALSE(bofs) << "open failed: " << filename;

  unsigned int version_ = version;
  bofs.write(reinterpret_cast<char *>(&version_), sizeof(unsigned int));

  int type = 0;
  bofs.write(reinterpret_cast<char *>(&type), sizeof(type));
  bofs.write(reinterpret_cast<char *>(&cost_factor_), sizeof(cost_factor_));
  bofs.write(reinterpret_cast<char *>(&maxid_), sizeof(maxid_));

  if (max_xsize_ > 0) {
    xsize_ = std::min(xsize_, max_xsize_);
  }
  bofs.write(reinterpret_cast<char *>(&xsize_), sizeof(xsize_));
  unsigned int dsize = static_cast<unsigned int>(da.unit_size() * da.size());
  bofs.write(reinterpret_cast<char *>(&dsize), sizeof(dsize));
  unsigned int size = static_cast<unsigned int>(y_str.size());
  bofs.write(reinterpret_cast<char *>(&size),  sizeof(size));
  bofs.write(y_str.data(), y_str.size());
  size = static_cast<unsigned int>(templ_str.size());
  bofs.write(reinterpret_cast<char *>(&size),  sizeof(size));
  bofs.write(templ_str.data(), templ_str.size());
  bofs.write(reinterpret_cast<const char *>(da.array()), dsize);

  for (size_t i  = 0; i < maxid_; ++i) {
    float alpha = static_cast<float>(alpha_[i]);
    bofs.write(reinterpret_cast<char *>(&alpha), sizeof(alpha));
  }

  // Closing flushes the buffer, so a failed write becomes visible here.
  bofs.close();
  CHECK_FALSE(!bofs.fail()) << "write failed: " << filename;

  if (textmodelfile) {
    std::string filename2 = filename;
    filename2 += ".txt";

    std::ofstream tofs(WPATH(filename2.c_str()));

    CHECK_FALSE(tofs) << " no such file or directory: " << filename2;

    // '\n' instead of std::endl: same bytes, but no flush after every line.

    // header
    tofs << "version: "     << version_ << '\n';
    tofs << "cost-factor: " << cost_factor_ << '\n';
    tofs << "maxid: "       << maxid_ << '\n';
    tofs << "xsize: "       << xsize_ << '\n';

    tofs << '\n';

    // y
    for (size_t i = 0; i < y_.size(); ++i) {
      tofs << y_[i] << '\n';
    }

    tofs << '\n';

    // template
    for (size_t i = 0; i < unigram_templs_.size(); ++i) {
      tofs << unigram_templs_[i] << '\n';
    }

    for (size_t i = 0; i < bigram_templs_.size(); ++i) {
      tofs << bigram_templs_[i] << '\n';
    }

    tofs << '\n';

    // dic
    for (std::map<std::string, std::pair<int, unsigned int> >::iterator
             it = dic_.begin();
         it != dic_.end(); ++it) {
      tofs << it->second.first << " " << it->first << '\n';
    }

    tofs << '\n';

    tofs.setf(std::ios::fixed, std::ios::floatfield);
    tofs.precision(16);

    for (size_t i  = 0; i < maxid_; ++i) {
      tofs << alpha_[i] << '\n';
    }

    tofs.close();
    CHECK_FALSE(!tofs.fail()) << "write failed: " << filename2;
  }

  return true;
}
bool Dictionary::compile(const Param &param,
                         const std::vector<std::string> &dics,
                         const char *matrix_file,
                         const char *matrix_bin_file,
                         const char *left_id_file,
                         const char *right_id_file,
                         const char *rewrite_file,
                         const char *pos_id_file,
                         const char *output) {
  Connector matrix;
  scoped_ptr<DictionaryRewriter> rewrite(0);
  scoped_ptr<POSIDGenerator> posid(0);
  scoped_ptr<ContextID> cid(0);
  scoped_ptr<Writer> writer(0);
  scoped_ptr<StringBuffer> os(0);
  Node node;

  std::vector<std::pair<std::string, Token*> > dic;

  size_t offset  = 0;
  unsigned int lexsize = 0;
  std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
  int lid, rid, cost;

  const std::string from = param.get<std::string>("dictionary-charset");
  const std::string to = param.get<std::string>("charset");
  const bool wakati = param.get<bool>("wakati");
  const int type = param.get<int>("type");
  const std::string node_format = param.get<std::string>("node-format");

  // for backward compatibility
  std::string config_charset = param.get<std::string>("config-charset");
  if (config_charset.empty()) config_charset = from;

  CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
  CHECK_DIE(!to.empty())   << "output dictionary charset is empty";

  Iconv iconv;
  CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;

  Iconv config_iconv;
  CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
      << "iconv_open() failed with from=" << config_charset << " to=" << from;

  if (!node_format.empty()) {
    writer.reset(new Writer);
    os.reset(new StringBuffer);
    memset(&node, 0, sizeof(node));
  }

  if (!matrix.openText(matrix_file) &&
      !matrix.open(matrix_bin_file)) {
    matrix.set_left_size(1);
    matrix.set_right_size(1);
  }

  posid.reset(new POSIDGenerator);
  posid->open(pos_id_file, &config_iconv);

  std::istringstream iss(UNK_DEF_DEFAULT);

  for (size_t i = 0; i < dics.size(); ++i) {
    std::ifstream ifs(dics[i].c_str());
    std::istream *is = &ifs;
    if (!ifs) {
      if (type == MECAB_UNK_DIC) {
        std::cerr << dics[i]
                  << " is not found. minimum setting is used." << std::endl;
        is = &iss;
      } else {
        CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
      }
    }

    std::cout << "reading " << dics[i] << " ... ";

    char line[BUF_SIZE];
    size_t num = 0;

    while (is->getline(line, sizeof(line))) {
      char *col[8];
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;

      w = col[0];
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = col[4];
      int pid = posid->id(feature.c_str());

      if (lid < 0  || rid < 0) {
        if (!rewrite.get()) {
          rewrite.reset(new DictionaryRewriter);
          rewrite->open(rewrite_file, &config_iconv);
        }

        CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
            << "rewrite failed: " << feature;

        if (!cid.get()) {
          cid.reset(new ContextID);
          cid->open(left_id_file, right_id_file, &config_iconv);
          CHECK_DIE(cid->left_size()  == matrix.left_size() &&
                    cid->right_size() == matrix.right_size())
              << "Context ID files("
              << left_id_file
              << " or "
              << right_id_file << " may be broken";
        }

        lid = cid->lid(lfeature.c_str());
        rid = cid->rid(rfeature.c_str());
      }

      CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;

      if (w.empty()) {
        std::cerr << "empty word is found, discard this line" << std::endl;
        continue;
      }

      if (!iconv.convert(&feature)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (!node_format.empty()) {
        node.surface = w.c_str();
        node.feature = feature.c_str();
        node.length  = w.size();
        node.rlength = w.size();
        node.posid   = pid;
        node.stat    = MECAB_NOR_NODE;
        CHECK_DIE(os.get());
        CHECK_DIE(writer.get());
        os->clear();
        CHECK_DIE(writer->writeNode(&*os,
                                    node_format.c_str(),
                                    w.c_str(),
                                    &node)) <<
            "conversion error: " << feature << " with " << node_format;
        *os << '\0';
        feature = os->str();
      }

      key.clear();
      if (!wakati) key = feature + '\0';

      Token* token  = new Token;
      token->lcAttr = lid;
      token->rcAttr = rid;
      token->posid  = pid;
      token->wcost = cost;
      token->feature = offset;
      token->compound = 0;
      dic.push_back(std::make_pair<std::string, Token*>(w, token));

      // append to output buffer
      if (!wakati) fbuf.append(key.data(), key.size());
      offset += key.size();

      ++num;
      ++lexsize;
    }

    std::cout << num << std::endl;
  }

  if (wakati) fbuf.append("\0", 1);

  std::sort(dic.begin(), dic.end());

  size_t bsize = 0;
  size_t idx = 0;
  std::string prev;
  std::vector<const char *> str;
  std::vector<size_t> len;
  std::vector<Darts::DoubleArray::result_type> val;

  for (size_t i = 0; i < dic.size(); ++i) {
    if (i != 0 && prev != dic[i].first) {
      str.push_back(dic[idx].first.c_str());
      len.push_back(dic[idx].first.size());
      val.push_back(bsize +(idx << 8));
      bsize = 1;
      idx = i;
    } else {
      ++bsize;
    }
    prev = dic[i].first;
  }
  str.push_back(dic[idx].first.c_str());
  len.push_back(dic[idx].first.size());
  val.push_back(bsize +(idx << 8));

  CHECK_DIE(str.size() == len.size());
  CHECK_DIE(str.size() == val.size());

  Darts::DoubleArray da;
  CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
                     &len[0], &val[0], &progress_bar_darts) == 0)
      << "unkown error in building double-array";

  std::string tbuf;
  for (size_t i = 0; i < dic.size(); ++i) {
    tbuf.append(reinterpret_cast<const char*>(dic[i].second),
                sizeof(Token));
    delete dic[i].second;
  }
  dic.clear();

  // needs to be 8byte(64bit) aligned
  while (tbuf.size() % 8 != 0) {
    Token dummy;
    memset(&dummy, 0, sizeof(Token));
    tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
  }

  unsigned int dummy = 0;
  unsigned int lsize = matrix.left_size();
  unsigned int rsize = matrix.right_size();
  unsigned int dsize = da.unit_size() * da.size();
  unsigned int tsize = tbuf.size();
  unsigned int fsize = fbuf.size();

  unsigned int version = DIC_VERSION;
  char charset[32];
  std::fill(charset, charset + sizeof(charset), '\0');
  std::strncpy(charset, to.c_str(), 31);

  std::ofstream bofs(output, std::ios::binary|std::ios::out);
  CHECK_DIE(bofs) << "permission denied: " << output;

  unsigned int magic = 0;

  // needs to be 64bit aligned
  // 10*32 = 64*5
  bofs.write(reinterpret_cast<const char *>(&magic),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&type),    sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&rsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&tsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&fsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dummy),   sizeof(unsigned int));

  // 32 * 8 = 64 * 4
  bofs.write(reinterpret_cast<const char *>(charset),  sizeof(charset));

  bofs.write(reinterpret_cast<const char*>(da.array()),
             da.unit_size() * da.size());
  bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
  bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());

  // save magic id
  magic = static_cast<unsigned int>(bofs.tellp());
  magic ^= DictionaryMagicID;
  bofs.seekp(0);
  bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));

  bofs.close();

  return true;
}