bool FeatureIndex::buildBigramFeature(LearnerPath *path,
                                      const char *rfeature,
                                      const char *lfeature) {
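  // Expand each bigram feature template against the two context feature
  // strings: %L[i]/%R[i] pick CSV fields, %l/%r insert the whole feature
  // string. The resulting feature ids are copied into path->fvector.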
  char rbuf[BUFSIZE];
  char lbuf[BUFSIZE];
  char *R[POSSIZE];
  char *L[POSSIZE];

  feature_.clear();
  std::strncpy(lbuf,  rfeature, BUFSIZE);
  std::strncpy(rbuf,  lfeature, BUFSIZE);

  size_t lsize = tokenizeCSV(lbuf, L, POSSIZE);
  size_t rsize = tokenizeCSV(rbuf, R, POSSIZE);

  for (std::vector<const char*>::const_iterator it = bigram_templs_.begin();
       it != bigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'L': {
              const char *r = getIndex(const_cast<char **>(&p), L, lsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'R': {
              const char *r = getIndex(const_cast<char **>(&p), R, rsize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 'l':  os_ << lfeature; break;  // use lfeature as it is
            case 'r':  os_ << rfeature; break;
            default:
              CHECK_FALSE(false) << "unknown meta char: " << *p;
          }
        }
      }
    }

    os_ << '\0';

    ADDB(os_.str());

 NEXT: continue;
  }

  COPY_FEATURE(path->fvector);

  return true;
}
bool RewritePattern::set_pattern(const char *src,
                                 const char *dst) {
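  // Split the source and destination rewrite patterns (CSV) into spat_/dpat_.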
  char buf[BUF_SIZE];
  spat_.clear();
  dpat_.clear();

  std::strncpy(buf, src, sizeof(buf));
  tokenizeCSV(buf, back_inserter(spat_), 512);

  std::strncpy(buf, dst, sizeof(buf));
  tokenizeCSV(buf, back_inserter(dpat_), 512);

  return (spat_.size() && dpat_.size());
}
  static bool read(std::istream *is,
                   std::vector<std::vector<std::string> > *r,
                   const std::vector<int> &level) {
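    // Reads one sentence (up to "EOS"); each line is "surface\tCSV-feature".
    // For every entry in |level|, keeps the surface plus the first level[i]
    // feature fields (all fields when level[i] < 0).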
    if (!*is) return false;
    char buf[BUF_SIZE];
    char *col[2];
    char *cvs[BUF_SIZE];
    r->clear();
    while (is->getline(buf, sizeof(buf))) {
      if (std::strcmp(buf, "EOS") == 0) break;
      CHECK_DIE(tokenize(buf, "\t", col,  2) == 2) << "format error";
      cvs[0] = col[0];
      size_t n = tokenizeCSV(col[1], cvs + 1,
                             sizeof(cvs) / sizeof(cvs[0]) - 1);
      std::vector<std::string> tmp;
      for (size_t i = 0; i < level.size(); ++i) {
        size_t m = level[i] < 0 ? n : level[i];
        CHECK_DIE(m <= n) << " out of range " << level[i];
        std::string output;
        for (size_t j = 0; j <= m; ++j) {
          output += cvs[j];
          if (j != 0) output += "\t";
        }
        tmp.push_back(output);
      }
      r->push_back(tmp);
    }

    return true;
  }
int POSIDGenerator::id(const char *feature) const {
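  // Splits the CSV feature string and maps it to a POS id via the rewrite
  // rules; returns -1 when no rule matches.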
  char buf[BUF_SIZE];
  char *col[BUF_SIZE];
  CHECK_DIE(std::strlen(feature) < sizeof(buf) - 1) << "too long feature";
  std::strncpy(buf, feature, sizeof(buf) - 1);
  const size_t n = tokenizeCSV(buf, col, sizeof(col) / sizeof(col[0]));
  CHECK_DIE(n < sizeof(col) / sizeof(col[0])) << "too long CSV entities";
  std::string tmp;
  if (!rewrite_.rewrite(n, const_cast<const char **>(col), &tmp))
    return -1;
  return std::atoi(tmp.c_str());
}
// without cache
bool DictionaryRewriter::rewrite(const std::string &feature,
                                 std::string *ufeature,
                                 std::string *lfeature,
                                 std::string *rfeature) const {
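  // Splits the CSV feature and applies the unigram/left/right rewrite rules
  // to produce the three context feature strings.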
  char buf[BUF_SIZE];
  char *col[BUF_SIZE];
  CHECK_DIE(feature.size() < sizeof(buf) - 1) << "too long feature";
  std::strncpy(buf, feature.c_str(), sizeof(buf) - 1);
  size_t n = tokenizeCSV(buf, col, sizeof(col) / sizeof(col[0]));
  CHECK_DIE(n < sizeof(col) / sizeof(col[0])) << "too long CSV entities";
  return (unigram_rewrite_.rewrite(n, const_cast<const char **>(col),
                                   ufeature) &&
          left_rewrite_.rewrite(n, const_cast<const char **>(col),
                                lfeature) &&
          right_rewrite_.rewrite(n, const_cast<const char **>(col),
                                 rfeature));
}
bool FeatureIndex::buildUnigramFeature(LearnerPath *path,
                                       const char *ufeature) {
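  // Expand each unigram feature template: %F[i] picks a CSV field of the
  // node feature, %t the character type, %u the raw feature string. The
  // resulting feature ids are copied into path->rnode->fvector.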
  char ubuf[BUFSIZE];
  char *F[POSSIZE];

  feature_.clear();
  std::strncpy(ubuf, ufeature, BUFSIZE);
  size_t usize = tokenizeCSV(ubuf, F, POSSIZE);

  for (std::vector<const char*>::const_iterator it = unigram_templs_.begin();
       it != unigram_templs_.end(); ++it) {
    const char *p = *it;
    os_.clear();

    for (; *p; p++) {
      switch (*p) {
        default: os_ << *p; break;
        case '\\': os_ << getEscapedChar(*++p); break;
        case '%': {
          switch (*++p) {
            case 'F':  {
              const char *r = getIndex(const_cast<char **>(&p), F, usize);
              if (!r) goto NEXT;
              os_ << r;
            } break;
            case 't':  os_ << (size_t)path->rnode->char_type;     break;
            case 'u':  os_ << ufeature; break;
            default:
              CHECK_FALSE(false) << "unknown meta char: " << *p;
          }
        }
      }
    }

    os_ << '\0';
    ADDB(os_.str());

 NEXT: continue;
  }

  COPY_FEATURE(path->rnode->fvector);

  return true;
}
 static void gencid(const char *filename,
                    DictionaryRewriter *rewrite,
                    ContextID *cid) {
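   // Reads a CSV dictionary, rewrites each feature into left/right context
   // features, and registers them with the ContextID generator.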
   std::ifstream ifs(filename);
   CHECK_DIE(ifs) << "no such file or directory: " << filename;
   char line[BUF_SIZE];
   std::cout << "reading " << filename << " ... " << std::flush;
   size_t num = 0;
   std::string feature, ufeature, lfeature, rfeature;
   char *col[8];
   while (ifs.getline(line, sizeof(line))) {
     const size_t n = tokenizeCSV(line, col, 5);
     CHECK_DIE(n == 5) << "format error: " << line;
     feature = col[4];
     rewrite->rewrite2(feature, &ufeature, &lfeature, &rfeature);
     cid->add(lfeature.c_str(), rfeature.c_str());
     ++num;
   }
   std::cout << num << std::endl;
   ifs.close();
 }
Example #8
  static bool read(std::istream *is,
                   std::vector<std::vector<std::string> > *r,
                   const std::vector<int> &level) {
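    // Reads "surface\tCSV-feature" lines up to "EOS"; for each entry in
    // |level|, keeps the surface plus the first level[i] feature fields
    // (all fields when level[i] < 0).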
    if (!*is) {
      return false;
    }

    char *col[2];
    scoped_fixed_array<char, BUF_SIZE> buf;
    scoped_fixed_array<char *, BUF_SIZE> csv;
    r->clear();
    while (is->getline(buf.get(), buf.size())) {
      if (std::strcmp(buf.get(), "EOS") == 0) {
        break;
      }
      CHECK_DIE(tokenize(buf.get(), "\t", col,  2) == 2) << "format error";
      csv[0] = col[0];
      size_t n = tokenizeCSV(col[1], csv.get() + 1, csv.size() - 1);
      std::vector<std::string> tmp;
      for (size_t i = 0; i < level.size(); ++i) {
        size_t m = level[i] < 0 ? n : level[i];
        CHECK_DIE(m <= n) << " out of range " << level[i];
        std::string output;
        for (size_t j = 0; j <= m; ++j) {
          output += csv[j];
          if (j != 0) {
            output += "\t";
          }
        }
        tmp.push_back(output);
      }
      r->push_back(tmp);
    }

    return true;
  }
Example #9
bool Tokenizer<N, P>::open(const Param &param) {
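  // Loads the unknown-word and system dictionaries from "dicdir", any user
  // dictionaries listed in "userdic" (comma separated), builds the
  // dictionary_info_ chain, and caches the unknown-word tokens per
  // character category.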
  close();

  const std::string prefix = param.template get<std::string>("dicdir");

  CHECK_FALSE(unkdic_.open(create_filename
                                 (prefix, UNK_DIC_FILE).c_str()))
      << unkdic_.what();
  CHECK_FALSE(property_.open(param)) << property_.what();

  Dictionary *sysdic = new Dictionary;

  CHECK_FALSE(sysdic->open
                    (create_filename(prefix, SYS_DIC_FILE).c_str()))
      << sysdic->what();

  CHECK_FALSE(sysdic->type() == 0)
      << "not a system dictionary: " << prefix;

  property_.set_charset(sysdic->charset());
  dic_.push_back(sysdic);

  const std::string userdic = param.template get<std::string>("userdic");
  if (!userdic.empty()) {
    scoped_fixed_array<char, BUF_SIZE> buf;
    scoped_fixed_array<char *, BUF_SIZE> dicfile;
    std::strncpy(buf.get(), userdic.c_str(), buf.size());
    const size_t n = tokenizeCSV(buf.get(), dicfile.get(), dicfile.size());
    for (size_t i = 0; i < n; ++i) {
      Dictionary *d = new Dictionary;
      CHECK_FALSE(d->open(dicfile[i])) << d->what();
      CHECK_FALSE(d->type() == 1)
          << "not a user dictionary: " << dicfile[i];
      CHECK_FALSE(sysdic->isCompatible(*d))
          << "incompatible dictionary: " << dicfile[i];
      dic_.push_back(d);
    }
  }

  dictionary_info_ = 0;
  dictionary_info_freelist_.free();
  for (int i = static_cast<int>(dic_.size() - 1); i >= 0; --i) {
    DictionaryInfo *d = dictionary_info_freelist_.alloc();
    d->next          = dictionary_info_;
    d->filename      = dic_[i]->filename();
    d->charset       = dic_[i]->charset();
    d->size          = dic_[i]->size();
    d->lsize         = dic_[i]->lsize();
    d->rsize         = dic_[i]->rsize();
    d->type          = dic_[i]->type();
    d->version       = dic_[i]->version();
    dictionary_info_ = d;
  }

  unk_tokens_.clear();
  for (size_t i = 0; i < property_.size(); ++i) {
    const char *key = property_.name(i);
    const Dictionary::result_type n = unkdic_.exactMatchSearch(key);
    CHECK_FALSE(n.value != -1) << "cannot find UNK category: " << key;
    const Token *token = unkdic_.token(n);
    size_t size = unkdic_.token_size(n);
    unk_tokens_.push_back(std::make_pair(token, size));
  }

  space_ = property_.getCharInfo(0x20);  // ad-hoc

  bos_feature_.reset_string(param.template get<std::string>("bos-feature"));

  const std::string tmp = param.template get<std::string>("unk-feature");
  unk_feature_.reset(0);
  if (!tmp.empty()) {
    unk_feature_.reset_string(tmp);
  }

  CHECK_FALSE(*bos_feature_ != '\0')
      << "bos-feature is undefined in dicrc";

  max_grouping_size_ = param.template get<size_t>("max-grouping-size");
  if (max_grouping_size_ == 0) {
    max_grouping_size_ = DEFAULT_MAX_GROUPING_SIZE;
  }

  return true;
}
Example #10
bool CharProperty::compile(const char *cfile,
                           const char *ufile,
                           const char *ofile) {
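  // Parses the character definition (cfile) and unknown-word definition
  // (ufile), falling back to the built-in defaults when a file is missing,
  // and writes the binary character property table to ofile.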
  scoped_fixed_array<char, BUF_SIZE> line;
  scoped_fixed_array<char *, 512> col;
  size_t id = 0;
  std::vector<Range> range;
  std::map<std::string, CharInfo> category;
  std::vector<std::string> category_ary;
  std::ifstream ifs(WPATH(cfile));
  std::istringstream iss(CHAR_PROPERTY_DEF_DEFAULT);
  std::istream *is = &ifs;

  if (!ifs) {
    std::cerr << cfile
              << " is not found. minimum setting is used" << std::endl;
    is = &iss;
  }

  while (is->getline(line.get(), line.size())) {
    if (std::strlen(line.get()) == 0 || line[0] == '#') {
      continue;
    }
    const size_t size = tokenize2(line.get(), "\t ", col.get(), col.size());
    CHECK_DIE(size >= 2) << "format error: " << line.get();

    // format: 0xXXXX..0xXXXX CATEGORY1 [CATEGORY2 ...] # comment
    if (std::strncmp(col[0], "0x", 2) == 0) {
      std::string low = col[0];
      std::string high;
      size_t pos = low.find("..");

      if (pos != std::string::npos) {
        high = low.substr(pos + 2, low.size() - pos - 2);
        low  = low.substr(0, pos);
      } else {
        high = low;
      }

      Range r;
      r.low = atohex(low.c_str());
      r.high = atohex(high.c_str());

      CHECK_DIE(r.low >= 0 && r.low < 0xffff &&
                r.high >= 0 && r.high < 0xffff &&
                r.low <= r.high)
          << "range error: low=" << r.low << " high=" << r.high;

      for (size_t i = 1; i < size; ++i) {
        if (col[i][0] == '#') {
          break;  // skip comments
        }
        CHECK_DIE(category.find(std::string(col[i])) != category.end())
            << "category [" << col[i] << "] is undefined";
        r.c.push_back(col[i]);
      }
      range.push_back(r);
    } else {
      CHECK_DIE(size >= 4) << "format error: " << line.get();

      std::string key = col[0];
      CHECK_DIE(category.find(key) == category.end())
          << "category " << key << " is already defined";

      CharInfo c;
      std::memset(&c, 0, sizeof(c));
      c.invoke  = std::atoi(col[1]);
      c.group   = std::atoi(col[2]);
      c.length  = std::atoi(col[3]);
      c.default_type = id++;

      category.insert(std::pair<std::string, CharInfo>(key, c));
      category_ary.push_back(key);
    }
  }

  CHECK_DIE(category.size() < 18) << "too many categories(>= 18)";

  CHECK_DIE(category.find("DEFAULT") != category.end())
      << "category [DEFAULT] is undefined";

  CHECK_DIE(category.find("SPACE") != category.end())
      << "category [SPACE] is undefined";

  std::istringstream iss2(UNK_DEF_DEFAULT);
  std::ifstream ifs2(WPATH(ufile));
  std::istream *is2 = &ifs2;

  if (!ifs2) {
    std::cerr << ufile
              << " is not found. minimum setting is used." << std::endl;
    is2 = &iss2;
  }

  std::set<std::string> unk;
  while (is2->getline(line.get(), line.size())) {
    const size_t n = tokenizeCSV(line.get(), col.get(), 2);
    CHECK_DIE(n >= 1) << "format error: " << line.get();
    const std::string key = col[0];
    CHECK_DIE(category.find(key) != category.end())
        << "category [" << key << "] is undefined in " << cfile;
    unk.insert(key);
  }

  for (std::map<std::string, CharInfo>::const_iterator it = category.begin();
       it != category.end();
       ++it) {
    CHECK_DIE(unk.find(it->first) != unk.end())
        << "category [" << it->first << "] is undefined in " << ufile;
  }

  std::vector<CharInfo> table(0xffff);
  {
    std::vector<std::string> tmp;
    tmp.push_back("DEFAULT");
    const CharInfo c = encode(tmp, &category);
    std::fill(table.begin(), table.end(), c);
  }

  for (std::vector<Range>::const_iterator it = range.begin();
       it != range.end();
       ++it) {
    const CharInfo c = encode(it->c, &category);
    for (int i = it->low; i <= it->high; ++i) {
      table[i] = c;
    }
  }

  // output binary table
  {
    std::ofstream ofs(WPATH(ofile), std::ios::binary|std::ios::out);
    CHECK_DIE(ofs) << "permission denied: " << ofile;

    unsigned int size = static_cast<unsigned int>(category.size());
    ofs.write(reinterpret_cast<const char*>(&size), sizeof(size));
    for (std::vector<std::string>::const_iterator it = category_ary.begin();
         it != category_ary.end();
         ++it) {
      char buf[32];
      std::fill(buf, buf + sizeof(buf), '\0');
      std::strncpy(buf, it->c_str(), sizeof(buf) - 1);
      ofs.write(reinterpret_cast<const char*>(buf), sizeof(buf));
    }
    ofs.write(reinterpret_cast<const char*>(&table[0]),
              sizeof(CharInfo) * table.size());
    ofs.close();
  }

  return true;
}
Example #11
bool Writer::writeNode(Lattice *lattice,
                       const char *p,
                       const Node *node,
                       StringBuffer *os) const {
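  // Expands the node format string: plain characters are copied, '\\'
  // introduces an escape, and '%' introduces a macro such as %m (surface),
  // %H (feature), %F/%f[...] (selected feature fields), or %p... (path and
  // cost details).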
  scoped_fixed_array<char, BUF_SIZE> buf;
  scoped_fixed_array<char *, 64> ptr;
  size_t psize = 0;

  for (; *p; p++) {
    switch (*p) {
      default: *os << *p; break;

      case '\\': *os << getEscapedChar(*++p); break;

      case '%': {  // macros
        switch (*++p) {
          default: {
            const std::string error = std::string("unknown meta char: ") + *p;
            lattice->set_what(error.c_str());
            return false;
          }
            // input sentence
          case 'S': os->write(lattice->sentence(), lattice->size()); break;
            // sentence length
          case 'L': *os << lattice->size(); break;
            // morph
          case 'm': os->write(node->surface, node->length); break;
          case 'M': os->write(reinterpret_cast<const char *>
                              (node->surface - node->rlength + node->length),
                              node->rlength);
            break;
          case 'h': *os << node->posid; break;  // Part-Of-Speech ID
          case '%': *os << '%'; break;         // %
          case 'c': *os << static_cast<int>(node->wcost); break;  // word cost
          case 'H': *os << node->feature; break;
          case 't': *os << static_cast<unsigned int>(node->char_type); break;
          case 's': *os << static_cast<unsigned int>(node->stat); break;
          case 'P': *os << node->prob; break;
          case 'p': {
            switch (*++p) {
              default:
                lattice->set_what("[iseSCwcnblLh] is required after %p");
                return false;
              case 'i': *os << node->id; break;  // node id
              case 'S': os->write(reinterpret_cast<const char*>
                                  (node->surface -
                                   node->rlength + node->length),
                                  node->rlength - node->length);
                break;  // space
                // start position
              case 's': *os << static_cast<int>(
                  node->surface - lattice->sentence());
                break;
                // end position
              case 'e': *os << static_cast<int>
                    (node->surface - lattice->sentence() + node->length);
                break;
                // connection cost
              case 'C': *os << node->cost -
                    node->prev->cost - node->wcost;
                break;
              case 'w': *os << node->wcost; break;  // word cost
              case 'c': *os << node->cost; break;  // best cost
              case 'n': *os << (node->cost - node->prev->cost); break;
                // node cost
                // * if best path, otherwise ' '
              case 'b': *os << (node->isbest ? '*' : ' '); break;
              case 'P': *os << node->prob; break;
              case 'A': *os << node->alpha; break;
              case 'B': *os << node->beta; break;
              case 'l': *os << node->length; break;  // length of morph
                // length of morph including the spaces
              case 'L': *os << node->rlength;    break;
              case 'h': {  // Hidden Layer ID
                switch (*++p) {
                  default:
                    lattice->set_what("lr is required after %ph");
                    return false;
                  case 'l': *os << node->lcAttr; break;   // current
                  case 'r': *os << node->rcAttr; break;   // prev
                }
              } break;

              case 'p': {
                char mode = *++p;
                char sep = *++p;
                if (sep == '\\') {
                  sep = getEscapedChar(*++p);
                }
                if (!node->lpath) {
                  lattice->set_what("no path information is available");
                  return false;
                }
                for (Path *path = node->lpath; path; path = path->lnext) {
                  if (path != node->lpath) *os << sep;
                  switch (mode) {
                    case 'i': *os << path->lnode->id; break;
                    case 'c': *os << path->cost; break;
                    case 'P': *os << path->prob; break;
                    default:
                      lattice->set_what("[icP] is required after %pp");
                      return false;
                  }
                }
              } break;

            }
          } break;

          case 'F':
          case 'f': {
            if (node->feature[0] == '\0') {
              lattice->set_what("no feature information available");
              return false;
            }
            if (!psize) {
              std::strncpy(buf.get(), node->feature, buf.size());
              psize = tokenizeCSV(buf.get(), ptr.get(), ptr.size());
            }

            // separator
            char separator = '\t';  // default separator
            if (*p == 'F') {  // change separator
              if (*++p == '\\') {
                separator = getEscapedChar(*++p);
              } else {
                separator = *p;
              }
            }

            if (*++p != '[') {
              lattice->set_what("cannot find '['");
              return false;
            }
            size_t n = 0;
            bool sep = false;
            bool isfil = false;
            p++;

            for (;; ++p) {
              switch (*p) {
                case '0': case '1': case '2': case '3': case '4':
                case '5': case '6': case '7': case '8': case '9':
                  n = 10 * n +(*p - '0');
                  break;
                case ',': case ']':
                  if (n >= psize) {
                    lattice->set_what("given index is out of range");
                    return false;
                  }
                  isfil = (ptr[n][0] != '*');
                  if (isfil) {
                    if (sep) {
                      *os << separator;
                    }
                    *os << ptr[n];
                  }
                  if (*p == ']') {
                    goto last;
                  }
                  sep = isfil;
                  n = 0;
                  break;
                default:
                  lattice->set_what("cannot find ']'");
                  return false;
              }
            }
          } last: break;
        }  // end switch
      } break;  // end case '%'
    }  // end switch
  }

  return true;
}
  static void gendic(const char* ifile,
                     const char* ofile,
                     const CharProperty &property,
                     DictionaryRewriter *rewrite,
                     const ContextID &cid,
                     DecoderFeatureIndex *fi,
                     bool unk,
                     int factor,
                     int default_cost) {
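    // Re-emits a CSV dictionary: for each entry the left/right context ids
    // are looked up from the rewritten features and the word cost is
    // recomputed from the trained model via buildUnigramFeature/calcCost.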
    std::ifstream ifs(ifile);
    CHECK_DIE(ifs) << "no such file or directory: " << ifile;

    std::ofstream ofs(ofile);
    CHECK_DIE(ofs) << "permission denied: " << ofile;

    std::string w, feature, ufeature, lfeature, rfeature;
    int cost, lid, rid;

    std::cout <<  "emitting " << ofile << " ... " << std::flush;

    LearnerPath path;
    LearnerNode rnode;
    LearnerNode lnode;
    rnode.stat  = lnode.stat = MECAB_NOR_NODE;
    rnode.rpath = &path;
    lnode.lpath = &path;
    path.lnode  = &lnode;
    path.rnode  = &rnode;

    char line[BUF_SIZE];
    char *col[8];
    size_t num = 0;

    while (ifs.getline(line, sizeof(line))) {
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;

      w = std::string(col[0]);
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = std::string(col[4]);

      rewrite->rewrite2(feature, &ufeature, &lfeature, &rfeature);
      lid = cid.lid(lfeature.c_str());
      rid = cid.rid(rfeature.c_str());

      CHECK_DIE(lid > 0) << "CID is not found for " << lfeature;
      CHECK_DIE(rid > 0) << "CID is not found for " << rfeature;

      if (unk) {
        int c = property.id(w.c_str());
        CHECK_DIE(c >= 0) << "unknown property [" << w << "]";
        path.rnode->char_type = (unsigned char)c;
      } else {
        size_t mblen;
        CharInfo cinfo = property.getCharInfo(w.c_str(),
                                              w.c_str() + w.size(), &mblen);
        path.rnode->char_type = cinfo.default_type;
      }

      fi->buildUnigramFeature(&path, ufeature.c_str());
      fi->calcCost(&rnode);
      CHECK_DIE(escape_csv_element(&w)) << "invalid character found: " << w;

      ofs << w << ',' << lid << ',' << rid << ','
          << tocost(rnode.wcost, factor, default_cost)
          << ',' << feature << std::endl;
      ++num;
    }

    std::cout << num << std::endl;
  }
bool Dictionary::compile(const Param &param,
                         const std::vector<std::string> &dics,
                         const char *matrix_file,
                         const char *matrix_bin_file,
                         const char *left_id_file,
                         const char *right_id_file,
                         const char *rewrite_file,
                         const char *pos_id_file,
                         const char *output) {
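  // Compiles the CSV dictionaries into a binary dictionary: assigns context
  // ids and POS ids, optionally rewrites features through node-format,
  // builds a Darts double-array over the keys, and writes the header,
  // double-array, token array, and feature strings to |output|.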
  Connector matrix;
  scoped_ptr<DictionaryRewriter> rewrite(0);
  scoped_ptr<POSIDGenerator> posid(0);
  scoped_ptr<ContextID> cid(0);
  scoped_ptr<Writer> writer(0);
  scoped_ptr<StringBuffer> os(0);
  Node node;

  std::vector<std::pair<std::string, Token*> > dic;

  size_t offset  = 0;
  unsigned int lexsize = 0;
  std::string w, feature, ufeature, lfeature, rfeature, fbuf, key;
  int lid, rid, cost;

  const std::string from = param.get<std::string>("dictionary-charset");
  const std::string to = param.get<std::string>("charset");
  const bool wakati = param.get<bool>("wakati");
  const int type = param.get<int>("type");
  const std::string node_format = param.get<std::string>("node-format");

  // for backward compatibility
  std::string config_charset = param.get<std::string>("config-charset");
  if (config_charset.empty()) config_charset = from;

  CHECK_DIE(!from.empty()) << "input dictionary charset is empty";
  CHECK_DIE(!to.empty())   << "output dictionary charset is empty";

  Iconv iconv;
  CHECK_DIE(iconv.open(from.c_str(), to.c_str()))
      << "iconv_open() failed with from=" << from << " to=" << to;

  Iconv config_iconv;
  CHECK_DIE(config_iconv.open(config_charset.c_str(), from.c_str()))
      << "iconv_open() failed with from=" << config_charset << " to=" << from;

  if (!node_format.empty()) {
    writer.reset(new Writer);
    os.reset(new StringBuffer);
    memset(&node, 0, sizeof(node));
  }

  if (!matrix.openText(matrix_file) &&
      !matrix.open(matrix_bin_file)) {
    matrix.set_left_size(1);
    matrix.set_right_size(1);
  }

  posid.reset(new POSIDGenerator);
  posid->open(pos_id_file, &config_iconv);

  std::istringstream iss(UNK_DEF_DEFAULT);

  for (size_t i = 0; i < dics.size(); ++i) {
    std::ifstream ifs(dics[i].c_str());
    std::istream *is = &ifs;
    if (!ifs) {
      if (type == MECAB_UNK_DIC) {
        std::cerr << dics[i]
                  << " is not found. minimum setting is used." << std::endl;
        is = &iss;
      } else {
        CHECK_DIE(ifs) << "no such file or directory: " << dics[i];
      }
    }

    std::cout << "reading " << dics[i] << " ... ";

    char line[BUF_SIZE];
    size_t num = 0;

    while (is->getline(line, sizeof(line))) {
      char *col[8];
      const size_t n = tokenizeCSV(line, col, 5);
      CHECK_DIE(n == 5) << "format error: " << line;

      w = col[0];
      lid = std::atoi(col[1]);
      rid = std::atoi(col[2]);
      cost = std::atoi(col[3]);
      feature = col[4];
      int pid = posid->id(feature.c_str());

      if (lid < 0  || rid < 0) {
        if (!rewrite.get()) {
          rewrite.reset(new DictionaryRewriter);
          rewrite->open(rewrite_file, &config_iconv);
        }

        CHECK_DIE(rewrite->rewrite(feature, &ufeature, &lfeature, &rfeature))
            << "rewrite failed: " << feature;

        if (!cid.get()) {
          cid.reset(new ContextID);
          cid->open(left_id_file, right_id_file, &config_iconv);
          CHECK_DIE(cid->left_size()  == matrix.left_size() &&
                    cid->right_size() == matrix.right_size())
              << "Context ID files("
              << left_id_file
              << " or "
              << right_id_file << " may be broken";
        }

        lid = cid->lid(lfeature.c_str());
        rid = cid->rid(rfeature.c_str());
      }

      CHECK_DIE(lid >= 0 && rid >= 0 && matrix.is_valid(lid, rid))
          << "invalid ids are found lid=" << lid << " rid=" << rid;

      if (w.empty()) {
        std::cerr << "empty word is found, discard this line" << std::endl;
        continue;
      }

      if (!iconv.convert(&feature)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (type != MECAB_UNK_DIC && !iconv.convert(&w)) {
        std::cerr << "iconv conversion failed. skip this entry"
                  << std::endl;
        continue;
      }

      if (!node_format.empty()) {
        node.surface = w.c_str();
        node.feature = feature.c_str();
        node.length  = w.size();
        node.rlength = w.size();
        node.posid   = pid;
        node.stat    = MECAB_NOR_NODE;
        CHECK_DIE(os.get());
        CHECK_DIE(writer.get());
        os->clear();
        CHECK_DIE(writer->writeNode(&*os,
                                    node_format.c_str(),
                                    w.c_str(),
                                    &node)) <<
            "conversion error: " << feature << " with " << node_format;
        *os << '\0';
        feature = os->str();
      }

      key.clear();
      if (!wakati) key = feature + '\0';

      Token* token  = new Token;
      token->lcAttr = lid;
      token->rcAttr = rid;
      token->posid  = pid;
      token->wcost = cost;
      token->feature = offset;
      token->compound = 0;
      dic.push_back(std::make_pair(w, token));

      // append to output buffer
      if (!wakati) fbuf.append(key.data(), key.size());
      offset += key.size();

      ++num;
      ++lexsize;
    }

    std::cout << num << std::endl;
  }

  if (wakati) fbuf.append("\0", 1);

  std::sort(dic.begin(), dic.end());

  size_t bsize = 0;
  size_t idx = 0;
  std::string prev;
  std::vector<const char *> str;
  std::vector<size_t> len;
  std::vector<Darts::DoubleArray::result_type> val;
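  // Group identical keys: each double-array value packs the number of tokens
  // sharing the key in the lower 8 bits and the index of the first token in
  // the upper bits (bsize + (idx << 8)).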

  for (size_t i = 0; i < dic.size(); ++i) {
    if (i != 0 && prev != dic[i].first) {
      str.push_back(dic[idx].first.c_str());
      len.push_back(dic[idx].first.size());
      val.push_back(bsize +(idx << 8));
      bsize = 1;
      idx = i;
    } else {
      ++bsize;
    }
    prev = dic[i].first;
  }
  str.push_back(dic[idx].first.c_str());
  len.push_back(dic[idx].first.size());
  val.push_back(bsize +(idx << 8));

  CHECK_DIE(str.size() == len.size());
  CHECK_DIE(str.size() == val.size());

  Darts::DoubleArray da;
  CHECK_DIE(da.build(str.size(), const_cast<char **>(&str[0]),
                     &len[0], &val[0], &progress_bar_darts) == 0)
      << "unkown error in building double-array";

  std::string tbuf;
  for (size_t i = 0; i < dic.size(); ++i) {
    tbuf.append(reinterpret_cast<const char*>(dic[i].second),
                sizeof(Token));
    delete dic[i].second;
  }
  dic.clear();

  // needs to be 8byte(64bit) aligned
  while (tbuf.size() % 8 != 0) {
    Token dummy;
    memset(&dummy, 0, sizeof(Token));
    tbuf.append(reinterpret_cast<const char*>(&dummy), sizeof(Token));
  }

  unsigned int dummy = 0;
  unsigned int lsize = matrix.left_size();
  unsigned int rsize = matrix.right_size();
  unsigned int dsize = da.unit_size() * da.size();
  unsigned int tsize = tbuf.size();
  unsigned int fsize = fbuf.size();

  unsigned int version = DIC_VERSION;
  char charset[32];
  std::fill(charset, charset + sizeof(charset), '\0');
  std::strncpy(charset, to.c_str(), 31);

  std::ofstream bofs(output, std::ios::binary|std::ios::out);
  CHECK_DIE(bofs) << "permission denied: " << output;

  unsigned int magic = 0;

  // needs to be 64bit aligned
  // 10*32 = 64*5
  bofs.write(reinterpret_cast<const char *>(&magic),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&version), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&type),    sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lexsize), sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&lsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&rsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&tsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&fsize),   sizeof(unsigned int));
  bofs.write(reinterpret_cast<const char *>(&dummy),   sizeof(unsigned int));

  // 32 * 8 = 64 * 4
  bofs.write(reinterpret_cast<const char *>(charset),  sizeof(charset));

  bofs.write(reinterpret_cast<const char*>(da.array()),
             da.unit_size() * da.size());
  bofs.write(const_cast<const char *>(tbuf.data()), tbuf.size());
  bofs.write(const_cast<const char *>(fbuf.data()), fbuf.size());

  // save magic id
  magic = static_cast<unsigned int>(bofs.tellp());
  magic ^= DictionaryMagicID;
  bofs.seekp(0);
  bofs.write(reinterpret_cast<const char *>(&magic), sizeof(unsigned int));

  bofs.close();

  return true;
}