// Builds the cased and lower-cased word alphabets from the training corpus.
//
// Opens the training file named by the pipe options through |reader|, inserts
// every word form (and its lower-cased variant) of every sentence into
// word_alphabet_ and word_lower_alphabet_ respectively, then stops the growth
// of both alphabets and logs their sizes. Each sentence returned by the
// reader is deleted here once it has been processed.
//
// Per-word frequencies are tallied while reading, but this function applies
// no frequency cutoff to them.
void CoreferenceDictionary::CreateWordDictionaries(
  CoreferenceSentenceReader *reader) {
  LOG(INFO) << "Creating word dictionary...";
  std::vector<int> word_freqs;
  std::vector<int> word_lower_freqs;

  // Note: special tokens (kTokenUnknown / kTokenStart / kTokenStop) are not
  // pre-inserted into the word alphabets by this function.

  // ::tolower has undefined behaviour for negative arguments other than EOF,
  // so every character is routed through unsigned char first.
  auto to_lower = [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  };

  // Increments the counter of |id|, growing |freqs| by exactly one slot when
  // |id| has just been created by the alphabet.
  auto count_occurrence = [](int id, std::vector<int> *freqs) {
    if (static_cast<size_t>(id) >= freqs->size()) {
      // A fresh id must be the next consecutive index.
      CHECK_EQ(static_cast<size_t>(id), freqs->size());
      freqs->push_back(0);
    }
    ++(*freqs)[id];
  };

  // Go through the corpus and build the word dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  CoreferenceSentence *instance =
    static_cast<CoreferenceSentence*>(reader->GetNext());
  while (instance != nullptr) {
    const int num_words = instance->size();
    for (int i = 0; i < num_words; ++i) {
      // Add form to alphabet.
      const std::string &form = instance->GetForm(i);
      count_occurrence(word_alphabet_.Insert(form), &word_freqs);

      // Add lower-case form to the alphabet.
      // TODO(atm): "sanitize" words, by escaping digit sequences:
      // word = re.sub('[\d]+', '#', word.lower())
      std::string form_lower(form);
      std::transform(form_lower.begin(), form_lower.end(),
                     form_lower.begin(), to_lower);
      count_occurrence(word_lower_alphabet_.Insert(form_lower),
                       &word_lower_freqs);
    }
    delete instance;
    instance = static_cast<CoreferenceSentence*>(reader->GetNext());
  }
  reader->Close();
  word_alphabet_.StopGrowth();
  word_lower_alphabet_.StopGrowth();

  LOG(INFO) << "Number of words: " << word_alphabet_.size();
  LOG(INFO) << "Number of lower-case words: " << word_lower_alphabet_.size();
}
void DependencyInstanceNumeric::Initialize(
  const DependencyDictionary &dictionary,
  DependencyInstance* instance) {
  TokenDictionary *token_dictionary = dictionary.GetTokenDictionary();
  int length = instance->size();
  int i;
  int id;

  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  Clear();

  form_ids_.resize(length);
  form_lower_ids_.resize(length);
  lemma_ids_.resize(length);
  prefix_ids_.resize(length);
  suffix_ids_.resize(length);
  feats_ids_.resize(length);
  pos_ids_.resize(length);
  cpos_ids_.resize(length);
  //shapes_.resize(length);
  is_noun_.resize(length);
  is_verb_.resize(length);
  is_punc_.resize(length);
  is_coord_.resize(length);
  heads_.resize(length);
  relations_.resize(length);

  for (i = 0; i < length; i++) {
    std::string form = instance->GetForm(i);
    std::string form_lower(form);
    transform(form_lower.begin(), form_lower.end(), form_lower.begin(),
              ::tolower);
    if (!form_case_sensitive) form = form_lower;
    id = token_dictionary->GetFormId(form);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    form_ids_[i] = id;

    id = token_dictionary->GetFormLowerId(form_lower);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    form_lower_ids_[i] = id;

    id = token_dictionary->GetLemmaId(instance->GetLemma(i));
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    lemma_ids_[i] = id;

    std::string prefix = form.substr(0, prefix_length);
    id = token_dictionary->GetPrefixId(prefix);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    prefix_ids_[i] = id;

    int start = form.length() - suffix_length;
    if (start < 0) start = 0;
    std::string suffix = form.substr(start, suffix_length);
    id = token_dictionary->GetSuffixId(suffix);
    CHECK_LT(id, 0xffff);
    if (id < 0) id = TOKEN_UNKNOWN;
    suffix_ids_[i] = id;

    id = token_dictionary->GetPosTagId(instance->GetPosTag(i));
    CHECK_LT(id, 0xff);
    if (id < 0) id = TOKEN_UNKNOWN;
    pos_ids_[i] = id;

    id = token_dictionary->GetCoarsePosTagId(instance->GetCoarsePosTag(i));
    CHECK_LT(id, 0xff);
    if (id < 0) id = TOKEN_UNKNOWN;
    cpos_ids_[i] = id;

    feats_ids_[i].resize(instance->GetNumMorphFeatures(i));
    for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) {
      id = token_dictionary->GetMorphFeatureId(instance->GetMorphFeature(i, j));
      CHECK_LT(id, 0xffff);
      if (id < 0) id = TOKEN_UNKNOWN;
      feats_ids_[i][j] = id;
    }

    //GetWordShape(instance->GetForm(i), &shapes_[i]);

    // Check whether the word is a noun, verb, punctuation or coordination.
    // Note: this depends on the POS tag string.
    // This procedure is taken from EGSTRA
    // (http://groups.csail.mit.edu/nlp/egstra/).
    is_noun_[i] = false;
    is_verb_[i] = false;
    is_punc_[i] = false;
    is_coord_[i] = false;

    const char* tag = instance->GetPosTag(i).c_str();
    if (tag[0] == 'v' || tag[0] == 'V') {
      is_verb_[i] = true;
    } else if (tag[0] == 'n' || tag[0] == 'N') {
      is_noun_[i] = true;
    } else if (strcmp(tag, "Punc") == 0 ||
               strcmp(tag, "$,") == 0 ||
               strcmp(tag, "$.") == 0 ||
               strcmp(tag, "PUNC") == 0 ||
               strcmp(tag, "punc") == 0 ||
               strcmp(tag, "F") == 0 ||
               strcmp(tag, "IK") == 0 ||
               strcmp(tag, "XP") == 0 ||
               strcmp(tag, ",") == 0 ||
               strcmp(tag, ";") == 0) {
      is_punc_[i] = true;
    } else if (strcmp(tag, "Conj") == 0 ||
               strcmp(tag, "KON") == 0 ||
               strcmp(tag, "conj") == 0 ||
               strcmp(tag, "Conjunction") == 0 ||
               strcmp(tag, "CC") == 0 ||
               strcmp(tag, "cc") == 0) {
      is_coord_[i] = true;
    }

    heads_[i] = instance->GetHead(i);
    relations_[i] = dictionary.GetLabelAlphabet().Lookup(
      instance->GetDependencyRelation(i));
  }
}
// Example #3
// 0
// Builds the token alphabets (forms, lower-case forms, prefixes, suffixes,
// word shapes and POS tags) from the training corpus read through |reader|.
//
// The corpus is first scanned into temporary alphabets with per-entry
// frequency counts. The member alphabets for forms, lower-case forms, shapes
// and POS tags are then rebuilt keeping only entries whose frequency exceeds
// the matching cutoff; a cutoff is incremented until the resulting alphabet
// is smaller than its maximum size. Prefixes and suffixes are inserted
// directly into the member alphabets without any cutoff. Finally every member
// alphabet is frozen and its size is checked against the id width limits.
// Each instance returned by the reader is deleted here once processed.
void EntityTokenDictionary::Initialize(EntityReader *reader) {
  SetTokenDictionaryFlagValues();
  LOG(INFO) << "Creating token dictionary...";

  std::vector<int> form_freqs;
  std::vector<int> form_lower_freqs;
  std::vector<int> shape_freqs;
  std::vector<int> pos_freqs;
  Alphabet form_alphabet;
  Alphabet form_lower_alphabet;
  Alphabet shape_alphabet;
  Alphabet pos_alphabet;

  std::string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;

  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    prefix_alphabet_.Insert(special_symbols[i]);
    suffix_alphabet_.Insert(special_symbols[i]);
    form_alphabet.Insert(special_symbols[i]);
    form_lower_alphabet.Insert(special_symbols[i]);
    shape_alphabet.Insert(special_symbols[i]);
    pos_alphabet.Insert(special_symbols[i]);

    // Counts of special symbols are set to -1:
    form_freqs.push_back(-1);
    form_lower_freqs.push_back(-1);
    shape_freqs.push_back(-1);
    pos_freqs.push_back(-1);
  }

  // ::tolower has undefined behaviour for negative arguments other than EOF,
  // so every character is routed through unsigned char first.
  auto to_lower = [](unsigned char c) {
    return static_cast<char>(::tolower(c));
  };

  // Increments the counter of |id|, growing |freqs| by exactly one slot when
  // |id| has just been created by the alphabet.
  auto count_occurrence = [](int id, std::vector<int> *freqs) {
    if (static_cast<size_t>(id) >= freqs->size()) {
      // A fresh id must be the next consecutive index.
      CHECK_EQ(static_cast<size_t>(id), freqs->size());
      freqs->push_back(0);
    }
    ++(*freqs)[id];
  };

  // Go through the corpus and build the dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  EntityInstance *instance =
    static_cast<EntityInstance*>(reader->GetNext());
  while (instance != nullptr) {
    const int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      // Add form to alphabet (lower-cased unless forms are case sensitive).
      std::string form_lower(instance->GetForm(i));
      std::transform(form_lower.begin(), form_lower.end(),
                     form_lower.begin(), to_lower);
      const std::string form =
        form_case_sensitive ? instance->GetForm(i) : form_lower;
      count_occurrence(form_alphabet.Insert(form), &form_freqs);

      // Add lower-case form to the alphabet.
      count_occurrence(form_lower_alphabet.Insert(form_lower),
                       &form_lower_freqs);

      // Add prefix/suffix to alphabet. No frequencies are kept for these.
      const std::string prefix = form.substr(0, prefix_length);
      prefix_alphabet_.Insert(prefix);
      // The suffix is the last |suffix_length| characters, or the whole form
      // when the form is shorter than that.
      int start = static_cast<int>(form.length()) - suffix_length;
      if (start < 0) start = 0;
      const std::string suffix = form.substr(start, suffix_length);
      suffix_alphabet_.Insert(suffix);

      // Add shape to alphabet. The shape is computed from the original form,
      // not from the possibly lower-cased one.
      std::string shape;
      GetWordShape(instance->GetForm(i), &shape);
      count_occurrence(shape_alphabet.Insert(shape), &shape_freqs);

      // Add POS to alphabet.
      count_occurrence(pos_alphabet.Insert(instance->GetPosTag(i)),
                       &pos_freqs);
    }
    delete instance;
    instance = static_cast<EntityInstance*>(reader->GetNext());
  }
  reader->Close();

  // Now adjust the cutoffs if necessary. Each loop below rebuilds a member
  // alphabet from scratch: the special symbols are re-inserted explicitly
  // (their recorded count is -1), followed by every entry whose frequency
  // exceeds the current cutoff. The cutoff is raised until the alphabet is
  // smaller than its maximum size.
  while (true) {
    form_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_alphabet_.Insert(special_symbols[i]);
    }
    for (const auto& form_token : form_alphabet) {
      if (form_freqs[form_token.second] > form_cutoff) {
        form_alphabet_.Insert(form_token.first);
      }
    }
    if (form_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_cutoff;
    LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "...";
  }

  while (true) {
    form_lower_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_lower_alphabet_.Insert(special_symbols[i]);
    }
    for (const auto& form_lower_token : form_lower_alphabet) {
      if (form_lower_freqs[form_lower_token.second] > form_lower_cutoff) {
        form_lower_alphabet_.Insert(form_lower_token.first);
      }
    }
    if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_lower_cutoff;
    LOG(INFO) << "Incrementing lower-case form cutoff to "
      << form_lower_cutoff << "...";
  }

  while (true) {
    shape_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      shape_alphabet_.Insert(special_symbols[i]);
    }
    for (const auto& shape_token : shape_alphabet) {
      if (shape_freqs[shape_token.second] > shape_cutoff) {
        shape_alphabet_.Insert(shape_token.first);
      }
    }
    if (shape_alphabet_.size() < kMaxShapeAlphabetSize) break;
    ++shape_cutoff;
    LOG(INFO) << "Incrementing shape cutoff to " << shape_cutoff << "...";
  }

  while (true) {
    pos_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      pos_alphabet_.Insert(special_symbols[i]);
    }
    for (const auto& pos_token : pos_alphabet) {
      if (pos_freqs[pos_token.second] > pos_cutoff) {
        pos_alphabet_.Insert(pos_token.first);
      }
    }
    if (pos_alphabet_.size() < kMaxPosAlphabetSize) break;
    ++pos_cutoff;
    LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "...";
  }

  // Freeze every member alphabet, including the ones this function never
  // populates (lemmas, morphological features, coarse POS tags).
  form_alphabet_.StopGrowth();
  form_lower_alphabet_.StopGrowth();
  shape_alphabet_.StopGrowth();
  lemma_alphabet_.StopGrowth();
  prefix_alphabet_.StopGrowth();
  suffix_alphabet_.StopGrowth();
  feats_alphabet_.StopGrowth();
  pos_alphabet_.StopGrowth();
  cpos_alphabet_.StopGrowth();

  // A plain '\n' yields the same message text as endl without requesting a
  // stream flush in the middle of the log statement.
  LOG(INFO) << "Number of forms: " << form_alphabet_.size() << '\n'
    << "Number of lower-case forms: " << form_lower_alphabet_.size() << '\n'
    << "Number of prefixes: " << prefix_alphabet_.size() << '\n'
    << "Number of suffixes: " << suffix_alphabet_.size() << '\n'
    << "Number of word shapes: " << shape_alphabet_.size() << '\n'
    << "Number of pos: " << pos_alphabet_.size();

  // Alphabet sizes must stay below the limits checked here (0xffff, and 0xff
  // for the POS alphabets).
  CHECK_LT(form_alphabet_.size(), 0xffff);
  CHECK_LT(form_lower_alphabet_.size(), 0xffff);
  CHECK_LT(shape_alphabet_.size(), 0xffff);
  CHECK_LT(lemma_alphabet_.size(), 0xffff);
  CHECK_LT(prefix_alphabet_.size(), 0xffff);
  CHECK_LT(suffix_alphabet_.size(), 0xffff);
  CHECK_LT(feats_alphabet_.size(), 0xffff);
  CHECK_LT(pos_alphabet_.size(), 0xff);
  CHECK_LT(cpos_alphabet_.size(), 0xff);

#ifndef NDEBUG
  BuildNames();
#endif
}