// Maps the compiled dictionary `file` (opened with `mode`) and points this
// object's members at the sections stored inside it.
//
// Layout as consumed below: ten unsigned int header fields (magic, version,
// type, lexsize, lsize, rsize, dsize, tsize, fsize and one unused value),
// a 32-byte charset field, then three sections of dsize, tsize and fsize
// bytes which are handed to da_, token_ and feature_ respectively.
//
// Returns true on success. A failed CHECK_CLOSE_FALSE leaves the function
// through the macro -- NOTE(review): presumably it records the streamed
// message, calls close() and returns false; confirm against the macro.
bool Dictionary::open(const char *file,
                      const char *mode) {
  filename_ = file;
  MMAP_OPEN(char, dmmap_, filename_, mode);

  // The fixed-size header is read unconditionally below, so insist on a
  // minimum file size before touching it.
  CHECK_CLOSE_FALSE(dmmap_->size() >= 100)
      << "dictionary file is broken: " << file;

  const char *ptr = dmmap_->begin();

  unsigned int dsize = 0;
  unsigned int tsize = 0;
  unsigned int fsize = 0;
  unsigned int magic = 0;
  unsigned int dummy = 0;

  // The stored magic must equal the file size XOR-ed with DictionaryMagicID.
  read_static<unsigned int>(&ptr, magic);
  CHECK_CLOSE_FALSE((magic ^ DictionaryMagicID) == dmmap_->size())
      << "dictionary file is broken: " << file;

  read_static<unsigned int>(&ptr, version_);
  CHECK_CLOSE_FALSE(version_ == DIC_VERSION)
      << "incompatible version: " << version_;

  read_static<unsigned int>(&ptr, type_);
  read_static<unsigned int>(&ptr, lexsize_);
  read_static<unsigned int>(&ptr, lsize_);
  read_static<unsigned int>(&ptr, rsize_);
  read_static<unsigned int>(&ptr, dsize);
  read_static<unsigned int>(&ptr, tsize);
  read_static<unsigned int>(&ptr, fsize);
  read_static<unsigned int>(&ptr, dummy);  // read only to step over it

  charset_ = ptr;
  ptr += 32;

  // Validate the three section sizes against the bytes actually left in the
  // mapping *before* forming any pointer from them. The sum is computed in a
  // wider type so that oversized values from a corrupt file cannot wrap
  // around, and no pointer is ever advanced beyond the end of the mapping.
  const char *const map_end = dmmap_->end();
  const unsigned long long payload =
      static_cast<unsigned long long>(dsize) + tsize + fsize;
  CHECK_CLOSE_FALSE(ptr <= map_end &&
                    payload == static_cast<unsigned long long>(map_end - ptr))
      << "dictionary file is broken: " << file;

  da_.set_array(reinterpret_cast<void *>(const_cast<char*>(ptr)));
  ptr += dsize;

  token_ = reinterpret_cast<const Token *>(ptr);
  ptr += tsize;

  // The check above already established that feature_ + fsize == map_end.
  feature_ = ptr;

  return true;
}
Ejemplo n.º 2
0
  // Configures the tagger from `param` and loads the model it names.
  //
  // Reads "nbest", "verbose", "model" and "cost-factor" from `param`,
  // creates a DecoderFeatureIndex for the model and applies the cost factor.
  // Returns true on success; returns false after calling close() when
  // Param::help_version() reports false or when the cost factor is <= 0.
  // A failed model open leaves through CHECK_CLOSE_FALSE -- NOTE(review):
  // presumably that also closes and returns false; confirm against the macro.
  bool TaggerImpl::open(Param *param) {
    close();

    const bool proceed = param->help_version();
    if (!proceed) {
      close();
      return false;
    }

    nbest_ = param->get<int>("nbest");
    vlevel_ = param->get<int>("verbose");

    const std::string model_path = param->get<std::string>("model");

    feature_index_ = new DecoderFeatureIndex();
    CHECK_CLOSE_FALSE(feature_index_->open(model_path.c_str(), 0))
      << feature_index_->what();

    // The cost factor is only inspected once the model has been opened.
    const double cost_factor = param->get<double>("cost-factor");
    if (cost_factor <= 0.0) {
      WHAT << "cost factor must be positive";
      close();
      return false;
    }

    feature_index_->set_cost_factor(cost_factor);
    ysize_ = feature_index_->ysize();

    return true;
  }
Ejemplo n.º 3
0
// Opens everything the tokenizer needs, as configured by `param`:
//   * the unknown-word dictionary (UNK_DIC_FILE) and property_,
//   * the system dictionary (SYS_DIC_FILE) found under "dicdir",
//   * every user dictionary listed in the comma-separated "userdic" value,
// then rebuilds the dictionary_info_ list, the per-category unk_tokens_
// table and the BOS / unknown-word feature strings.
//
// Returns true on success. A failed CHECK_CLOSE_FALSE leaves the function
// through the macro -- NOTE(review): presumably it records the streamed
// message, calls close() and returns false; confirm against the macro.
bool TokenizerImpl<N, P>::open(const Param &param) {
  close();

  const std::string prefix = param.template get<std::string>("dicdir");

  // Dictionaries are opened with "r+" only when "open-mutable-dictionary"
  // is set; otherwise with "r".
  const char *mode = param.template get<bool>("open-mutable-dictionary") ?
      "r+" : "r";

  CHECK_CLOSE_FALSE(unkdic_.open(create_filename
                                 (prefix, UNK_DIC_FILE).c_str(), mode))
      << unkdic_.what();
  CHECK_CLOSE_FALSE(property_.open(param)) << property_.what();

  // NOTE(review): sysdic is not stored in dic_ until all checks pass, so a
  // failure below looks like it leaks the object unless close() handles it;
  // confirm.
  Dictionary *sysdic = new Dictionary;

  CHECK_CLOSE_FALSE(sysdic->open
                    (create_filename(prefix, SYS_DIC_FILE).c_str(), mode))
      << sysdic->what();

  CHECK_CLOSE_FALSE(sysdic->type() == 0)
      << "not a system dictionary: " << prefix;

  property_.set_charset(sysdic->charset());
  dic_.push_back(sysdic);

  const std::string userdic = param.template get<std::string>("userdic");
  if (!userdic.empty()) {
    char buf[BUF_SIZE];
    char *userdic_paths[BUF_SIZE];

    // std::strncpy() writes no terminating NUL when the source fills the
    // whole destination, which would let the CSV splitter run off the end
    // of buf. Refuse such input instead of reading past the buffer.
    CHECK_CLOSE_FALSE(userdic.size() < sizeof(buf))
        << "userdic is too long: " << userdic;
    std::strncpy(buf, userdic.c_str(), sizeof(buf));

    // Pass the capacity of userdic_paths in elements, not in bytes.
    // NOTE(review): assumes the third argument of tokenizeCSV is the
    // maximum number of output entries; confirm against its declaration.
    const size_t capacity = sizeof(userdic_paths) / sizeof(userdic_paths[0]);
    const size_t count = tokenizeCSV(buf, userdic_paths, capacity);

    for (size_t i = 0; i < count; ++i) {
      // NOTE(review): as with sysdic, d is only stored in dic_ after all
      // three checks succeed; confirm it is not leaked on failure.
      Dictionary *d = new Dictionary;
      CHECK_CLOSE_FALSE(d->open(userdic_paths[i], mode)) << d->what();
      CHECK_CLOSE_FALSE(d->type() == 1)
          << "not a user dictionary: " << userdic_paths[i];
      CHECK_CLOSE_FALSE(sysdic->isCompatible(*d))
          << "incompatible dictionary: " << userdic_paths[i];
      dic_.push_back(d);
    }
  }

  // Rebuild the singly linked DictionaryInfo list. Walking dic_ backwards
  // while prepending leaves the list in the same order as dic_. dic_ holds
  // at least the system dictionary here, so size() - 1 cannot underflow.
  dictionary_info_ = 0;
  dictionary_info_freelist_.free();
  for (int i = static_cast<int>(dic_.size() - 1); i >= 0; --i) {
    DictionaryInfo *d = dictionary_info_freelist_.alloc();
    d->next = dictionary_info_;
    d->filename = dic_[i]->filename();
    d->charset = dic_[i]->charset();
    d->size = dic_[i]->size();
    d->lsize = dic_[i]->lsize();
    d->rsize = dic_[i]->rsize();
    d->type = dic_[i]->type();
    d->version = dic_[i]->version();
    dictionary_info_ = d;
  }

  // Every name known to property_ must have an entry in the unknown-word
  // dictionary; remember its token array and length, indexed like property_.
  unk_tokens_.clear();
  for (size_t i = 0; i < property_.size(); ++i) {
    const char *key = property_.name(i);
    Dictionary::result_type found = unkdic_.exactMatchSearch(key);
    CHECK_CLOSE_FALSE(found.value != -1)
        << "cannot find UNK category: " << key;
    const Token *token = unkdic_.token(found);
    size_t size  = unkdic_.token_size(found);
    unk_tokens_.push_back(std::make_pair(token, size));
  }

  space_ = property_.getCharInfo(0x20);  // ad-hoc

  bos_feature_.reset_string(param.template get<std::string>("bos-feature"));

  // unk_feature_ is cleared first and only set when "unk-feature" is
  // non-empty.
  const std::string unk_feature = param.template get<std::string>("unk-feature");
  unk_feature_.reset(0);
  if (!unk_feature.empty()) unk_feature_.reset_string(unk_feature);

  CHECK_CLOSE_FALSE(*bos_feature_ != '\0')
      << "bos-feature is undefined in dicrc";

  // A configured value of 0 selects DEFAULT_MAX_GROUPING_SIZE.
  max_grouping_size_ = param.template get<size_t>("max-grouping-size");
  if (max_grouping_size_ == 0)
    max_grouping_size_ = DEFAULT_MAX_GROUPING_SIZE;

  return true;
}