C++ (Cpp) SequenceInstance::GetForm примеры использования

Язык программирования: C++ (Cpp)

Класс/Тип: SequenceInstance

Метод/Функция: GetForm

Примеров на hotexamples.com: 2

C++ (Cpp) SequenceInstance::GetForm - 2 примера найдено. Это лучшие примеры C++ (Cpp) кода для SequenceInstance::GetForm, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Основные методы

Показать Скрыть

GetForm(2)

size(2)

GetTag(1)

Initialize(1)

Основные методы

GetForm (2)

size (2)

GetTag (1)

Initialize (1)

Пример #1

Показать файл

Файл: TokenDictionary.cpp Проект: davidlegit/TurboParser

void TokenDictionary::InitializeFromSequenceReader(SequenceReader *reader) {
  LOG(INFO) << "Creating token dictionary...";

  int form_cutoff = FLAGS_form_cutoff;
  int form_lower_cutoff = FLAGS_form_cutoff;
  int shape_cutoff = 0;
  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  std::vector<int> form_freqs;
  std::vector<int> form_lower_freqs;
  std::vector<int> shape_freqs;
  Alphabet form_alphabet;
  Alphabet form_lower_alphabet;
  Alphabet shape_alphabet;

  std::string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;

  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    prefix_alphabet_.Insert(special_symbols[i]);
    suffix_alphabet_.Insert(special_symbols[i]);
    form_alphabet.Insert(special_symbols[i]);
    form_lower_alphabet.Insert(special_symbols[i]);
    shape_alphabet.Insert(special_symbols[i]);

    // Counts of special symbols are set to -1:
    form_freqs.push_back(-1);
    form_lower_freqs.push_back(-1);
    shape_freqs.push_back(-1);
  }

  // Go through the corpus and build the dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  SequenceInstance *instance =
    static_cast<SequenceInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;

      // Add form to alphabet.
      std::string form = instance->GetForm(i);
      std::string form_lower(form);
      transform(form_lower.begin(), form_lower.end(), form_lower.begin(),
                ::tolower);
      if (!form_case_sensitive) form = form_lower;
      id = form_alphabet.Insert(form);
      if (id >= form_freqs.size()) {
        CHECK_EQ(id, form_freqs.size());
        form_freqs.push_back(0);
      }
      ++form_freqs[id];

      // Add lower-case form to the alphabet.
      id = form_lower_alphabet.Insert(form_lower);
      if (id >= form_lower_freqs.size()) {
        CHECK_EQ(id, form_lower_freqs.size());
        form_lower_freqs.push_back(0);
      }
      ++form_lower_freqs[id];

      // Add prefix/suffix to alphabet.
      std::string prefix = form.substr(0, prefix_length);
      id = prefix_alphabet_.Insert(prefix);
      int start = form.length() - suffix_length;
      if (start < 0) start = 0;
      std::string suffix = form.substr(start, suffix_length);
      id = suffix_alphabet_.Insert(suffix);

      // Add shape to alphabet.
      std::string shape;
      GetWordShape(instance->GetForm(i), &shape);
      id = shape_alphabet.Insert(shape);
      if (id >= shape_freqs.size()) {
        CHECK_EQ(id, shape_freqs.size());
        shape_freqs.push_back(0);
      }
      ++shape_freqs[id];
    }
    delete instance;
    instance = static_cast<SequenceInstance*>(reader->GetNext());
  }
  reader->Close();

  // Now adjust the cutoffs if necessary.
  while (true) {
    form_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_alphabet.begin();
         iter != form_alphabet.end();
         ++iter) {
      if (form_freqs[iter->second] > form_cutoff) {
        form_alphabet_.Insert(iter->first);
      }
    }
    if (form_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_cutoff;
    LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "...";
  }

  while (true) {
    form_lower_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_lower_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_lower_alphabet.begin();
         iter != form_lower_alphabet.end();
         ++iter) {
      if (form_lower_freqs[iter->second] > form_lower_cutoff) {
        form_lower_alphabet_.Insert(iter->first);
      }
    }
    if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_lower_cutoff;
    LOG(INFO) << "Incrementing lower-case form cutoff to "
              << form_lower_cutoff << "...";
  }

  while (true) {
    shape_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      shape_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = shape_alphabet.begin();
         iter != shape_alphabet.end();
         ++iter) {
      if (shape_freqs[iter->second] > shape_cutoff) {
        shape_alphabet_.Insert(iter->first);
      }
    }
    if (shape_alphabet_.size() < kMaxShapeAlphabetSize) break;
    ++shape_cutoff;
    LOG(INFO) << "Incrementing shape cutoff to " << shape_cutoff << "...";
  }

  form_alphabet_.StopGrowth();
  form_lower_alphabet_.StopGrowth();
  shape_alphabet_.StopGrowth();
  lemma_alphabet_.StopGrowth();
  prefix_alphabet_.StopGrowth();
  suffix_alphabet_.StopGrowth();
  feats_alphabet_.StopGrowth();
  pos_alphabet_.StopGrowth();
  cpos_alphabet_.StopGrowth();

  LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl
            << "Number of prefixes: "
            << prefix_alphabet_.size() << endl
            << "Number of suffixes: "
            << suffix_alphabet_.size() << endl
            << "Number of word shapes: "
            << shape_alphabet_.size();

  CHECK_LT(form_alphabet_.size(), 0xffff);
  CHECK_LT(form_lower_alphabet_.size(), 0xffff);
  CHECK_LT(shape_alphabet_.size(), 0xffff);
  CHECK_LT(lemma_alphabet_.size(), 0xffff);
  CHECK_LT(prefix_alphabet_.size(), 0xffff);
  CHECK_LT(suffix_alphabet_.size(), 0xffff);
  CHECK_LT(feats_alphabet_.size(), 0xffff);
  CHECK_LT(pos_alphabet_.size(), 0xff);
  CHECK_LT(cpos_alphabet_.size(), 0xff);
}

Пример #2

Показать файл

Файл: TaggerDictionary.cpp Проект: FranklinChen/TurboParser

void TaggerDictionary::CreateTagDictionary(SequenceReader *reader) {
  SequenceDictionary::CreateTagDictionary(reader);

  LOG(INFO) << "Creating word-tag dictionary...";
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  // Go through the corpus and build the existing tags for each word.
  word_tags_.clear();
  word_tags_.resize(token_dictionary_->GetNumForms());

  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  SequenceInstance *instance =
    static_cast<SequenceInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;
      string form = instance->GetForm(i);
      if (!form_case_sensitive) {
        transform(form.begin(), form.end(), form.begin(), ::tolower);
      }
      int word_id = token_dictionary_->GetFormId(form);
      //CHECK_GE(word_id, 0);

      id = tag_alphabet_.Lookup(instance->GetTag(i));
      CHECK_GE(id, 0);

      // Insert new tag in the set of word tags, if it is not there
      // already. NOTE: this is inefficient, maybe we should be using a
      // different data structure.
      if (word_id >= 0) {
        vector<int> &tags = word_tags_[word_id];
        int j;
        for (j = 0; j < tags.size(); ++j) {
          if (tags[j] == id) break;
        }
        if (j == tags.size()) tags.push_back(id);
      }
    }
    delete instance;
    instance = static_cast<SequenceInstance*>(reader->GetNext());
  }
  reader->Close();

  // If there is a list of possible tags for the unknown words, load it.
  TaggerOptions *options =
    static_cast<TaggerOptions*>(pipe_->GetOptions());
  if (options->GetUnknownWordTagsFilePath().size() == 0) {
    for (int i = 0; i < tag_alphabet_.size(); ++i) {
      unknown_word_tags_.push_back(i);
    }
  } else {
    LOG(INFO) << "Loading file with unknown word tags...";
    std::ifstream is;
    is.open(options->GetUnknownWordTagsFilePath().c_str(), ifstream::in);
    CHECK(is.good()) << "Could not open "
      << options->GetUnknownWordTagsFilePath() << ".";
    vector<vector<string> > sentence_fields;
    string line;
    if (is.is_open()) {
      while (!is.eof()) {
        getline(is, line);
        if (line.size() == 0) break;
        int tagid = tag_alphabet_.Lookup(line);
        CHECK(tagid >= 0) << "Tag " << line << " does not exist.";
        unknown_word_tags_.push_back(tagid);
        LOG(INFO) << "Unknown word tag: " << line;
      }
    }
  }
  LOG(INFO) << "Number of unknown word tags: " << unknown_word_tags_.size();
}