Exemplo n.º 1
0
void TokenDictionary::InitializeFromDependencyReader(DependencyReader *reader) {
  LOG(INFO) << "Creating token dictionary...";

  int form_cutoff = FLAGS_form_cutoff;
  int form_lower_cutoff = FLAGS_form_cutoff;
  int lemma_cutoff = FLAGS_lemma_cutoff;
  int feats_cutoff = FLAGS_feats_cutoff;
  int pos_cutoff = FLAGS_pos_cutoff;
  int cpos_cutoff = FLAGS_cpos_cutoff;
  int prefix_length = FLAGS_prefix_length;
  int suffix_length = FLAGS_suffix_length;
  bool form_case_sensitive = FLAGS_form_case_sensitive;

  vector<int> form_freqs;
  vector<int> form_lower_freqs;
  vector<int> lemma_freqs;
  vector<int> feats_freqs;
  vector<int> pos_freqs;
  vector<int> cpos_freqs;

  Alphabet form_alphabet;
  Alphabet form_lower_alphabet;
  Alphabet lemma_alphabet;
  Alphabet feats_alphabet;
  Alphabet pos_alphabet;
  Alphabet cpos_alphabet;

  string special_symbols[NUM_SPECIAL_TOKENS];
  special_symbols[TOKEN_UNKNOWN] = kTokenUnknown;
  special_symbols[TOKEN_START] = kTokenStart;
  special_symbols[TOKEN_STOP] = kTokenStop;

  for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
    prefix_alphabet_.Insert(special_symbols[i]);
    suffix_alphabet_.Insert(special_symbols[i]);
    form_alphabet.Insert(special_symbols[i]);
    form_lower_alphabet.Insert(special_symbols[i]);
    lemma_alphabet.Insert(special_symbols[i]);
    feats_alphabet.Insert(special_symbols[i]);
    pos_alphabet.Insert(special_symbols[i]);
    cpos_alphabet.Insert(special_symbols[i]);

    // Counts of special symbols are set to -1:
    form_freqs.push_back(-1);
    form_lower_freqs.push_back(-1);
    lemma_freqs.push_back(-1);
    feats_freqs.push_back(-1);
    pos_freqs.push_back(-1);
    cpos_freqs.push_back(-1);
  }

  // Go through the corpus and build the dictionaries,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  DependencyInstance *instance =
    static_cast<DependencyInstance*>(reader->GetNext());
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 0; i < instance_length; ++i) {
      int id;

      // Add form to alphabet.
      std::string form = instance->GetForm(i);
      std::string form_lower(form);
      transform(form_lower.begin(), form_lower.end(), form_lower.begin(),
                ::tolower);
      if (!form_case_sensitive) form = form_lower;
      id = form_alphabet.Insert(form);
      if (id >= form_freqs.size()) {
        CHECK_EQ(id, form_freqs.size());
        form_freqs.push_back(0);
      }
      ++form_freqs[id];

      // Add lower-case form to alphabet.
      id = form_lower_alphabet.Insert(form_lower);
      if (id >= form_lower_freqs.size()) {
        CHECK_EQ(id, form_lower_freqs.size());
        form_lower_freqs.push_back(0);
      }
      ++form_lower_freqs[id];

      // Add lemma to alphabet.
      id = lemma_alphabet.Insert(instance->GetLemma(i));
      if (id >= lemma_freqs.size()) {
        CHECK_EQ(id, lemma_freqs.size());
        lemma_freqs.push_back(0);
      }
      ++lemma_freqs[id];

      // Add prefix/suffix to alphabet.
      // TODO: add varying lengths.
      string prefix = form.substr(0, prefix_length);
      id = prefix_alphabet_.Insert(prefix);
      int start = form.length() - suffix_length;
      if (start < 0) start = 0;
      string suffix = form.substr(start, suffix_length);
      id = suffix_alphabet_.Insert(suffix);

      // Add POS to alphabet.
      id = pos_alphabet.Insert(instance->GetPosTag(i));
      if (id >= pos_freqs.size()) {
        CHECK_EQ(id, pos_freqs.size());
        pos_freqs.push_back(0);
      }
      ++pos_freqs[id];

      // Add CPOS to alphabet.
      id = cpos_alphabet.Insert(instance->GetCoarsePosTag(i));
      if (id >= cpos_freqs.size()) {
        CHECK_EQ(id, cpos_freqs.size());
        cpos_freqs.push_back(0);
      }
      ++cpos_freqs[id];

      // Add FEATS to alphabet.
      for (int j = 0; j < instance->GetNumMorphFeatures(i); ++j) {
        id = feats_alphabet.Insert(instance->GetMorphFeature(i,j));
        if (id >= feats_freqs.size()) {
          CHECK_EQ(id, feats_freqs.size());
          feats_freqs.push_back(0);
        }
        ++feats_freqs[id];
      }
    }
    delete instance;
    instance = static_cast<DependencyInstance*>(reader->GetNext());
  }
  reader->Close();

  // Now adjust the cutoffs if necessary.
  while (true) {
    form_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_alphabet.begin();
         iter != form_alphabet.end(); 
         ++iter) {
      if (form_freqs[iter->second] > form_cutoff) {
        form_alphabet_.Insert(iter->first);
      }
    }
    if (form_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_cutoff;
    LOG(INFO) << "Incrementing form cutoff to " << form_cutoff << "...";
  }

  while (true) {
    form_lower_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      form_lower_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = form_lower_alphabet.begin();
         iter != form_lower_alphabet.end();
         ++iter) {
      if (form_lower_freqs[iter->second] > form_lower_cutoff) {
        form_lower_alphabet_.Insert(iter->first);
      }
    }
    if (form_lower_alphabet_.size() < kMaxFormAlphabetSize) break;
    ++form_lower_cutoff;
    LOG(INFO) << "Incrementing lower-case form cutoff to "
              << form_lower_cutoff << "...";
  }

  while (true) {
    lemma_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      lemma_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = lemma_alphabet.begin();
         iter != lemma_alphabet.end();
         ++iter) {
      if (lemma_freqs[iter->second] > lemma_cutoff) {
        lemma_alphabet_.Insert(iter->first);
      }
    }
    if (lemma_alphabet_.size() < kMaxLemmaAlphabetSize) break;
    ++lemma_cutoff;
    LOG(INFO) << "Incrementing lemma cutoff to " << lemma_cutoff << "...";
  }

  while (true) {
    pos_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      pos_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = pos_alphabet.begin();
         iter != pos_alphabet.end(); 
         ++iter) {
      if (pos_freqs[iter->second] > pos_cutoff) {
        pos_alphabet_.Insert(iter->first);
      }
    }
    if (pos_alphabet_.size() < kMaxPosAlphabetSize) break;
    ++pos_cutoff;
    LOG(INFO) << "Incrementing POS cutoff to " << pos_cutoff << "...";
  }

  while (true) {
    cpos_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      cpos_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = cpos_alphabet.begin();
         iter != cpos_alphabet.end(); 
         ++iter) {
      if (cpos_freqs[iter->second] > cpos_cutoff) {
        cpos_alphabet_.Insert(iter->first);
      }
    }
    if (cpos_alphabet_.size() < kMaxCoarsePosAlphabetSize) break;
    ++cpos_cutoff;
    LOG(INFO) << "Incrementing CPOS cutoff to " << cpos_cutoff << "...";
  }

  while (true) {
    feats_alphabet_.clear();
    for (int i = 0; i < NUM_SPECIAL_TOKENS; ++i) {
      feats_alphabet_.Insert(special_symbols[i]);
    }
    for (Alphabet::iterator iter = feats_alphabet.begin();
         iter != feats_alphabet.end(); 
         ++iter) {
      if (feats_freqs[iter->second] > feats_cutoff) {
        feats_alphabet_.Insert(iter->first);
      }
    }
    if (feats_alphabet_.size() < kMaxFeatsAlphabetSize) break;
    ++feats_cutoff;
    LOG(INFO) << "Incrementing FEATS cutoff to " << feats_cutoff << "...";
  }

  form_alphabet_.StopGrowth();
  form_lower_alphabet_.StopGrowth();
  lemma_alphabet_.StopGrowth();
  prefix_alphabet_.StopGrowth();
  suffix_alphabet_.StopGrowth();
  feats_alphabet_.StopGrowth();
  pos_alphabet_.StopGrowth();
  cpos_alphabet_.StopGrowth();

  LOG(INFO) << "Number of forms: " << form_alphabet_.size() << endl
            << "Number of lower-case forms: " << form_lower_alphabet_.size()
            << endl
            << "Number of lemmas: " << lemma_alphabet_.size() << endl
            << "Number of prefixes: " << prefix_alphabet_.size() << endl
            << "Number of suffixes: " << suffix_alphabet_.size() << endl
            << "Number of feats: " << feats_alphabet_.size() << endl
            << "Number of pos: " << pos_alphabet_.size() << endl
            << "Number of cpos: " << cpos_alphabet_.size();

  CHECK_LT(form_alphabet_.size(), 0xffff);
  CHECK_LT(form_lower_alphabet_.size(), 0xffff);
  CHECK_LT(lemma_alphabet_.size(), 0xffff);
  CHECK_LT(prefix_alphabet_.size(), 0xffff);
  CHECK_LT(suffix_alphabet_.size(), 0xffff);
  CHECK_LT(feats_alphabet_.size(), 0xffff);
  CHECK_LT(pos_alphabet_.size(), 0xff);
  CHECK_LT(cpos_alphabet_.size(), 0xff);

  // TODO: Remove this (only for debugging purposes).
  BuildNames();
}
void DependencyDictionary::CreateLabelDictionary(DependencyReader *reader) {
  LOG(INFO) << "Creating label dictionary...";

  vector<int> label_freqs;

  // Go through the corpus and build the label dictionary,
  // counting the frequencies.
  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  DependencyInstance *instance = reader->GetNext();
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 1; i < instance_length; ++i) {
      int id;

      // Add dependency label to alphabet.
      id = label_alphabet_.Insert(instance->GetDependencyRelation(i));
      if (id >= label_freqs.size()) {
        CHECK_EQ(id, label_freqs.size());
        label_freqs.push_back(0);
      }
      ++label_freqs[id];
    }
    delete instance;
    instance = reader->GetNext();
  }
  reader->Close();
  label_alphabet_.StopGrowth();

  // Go through the corpus and build the existing labels for each head-modifier
  // POS pair.
  existing_labels_.clear();
  existing_labels_.resize(token_dictionary_->GetNumPosTags(),
                          vector<vector<int> >(
                            token_dictionary_->GetNumPosTags()));

  maximum_left_distances_.clear();
  maximum_left_distances_.resize(token_dictionary_->GetNumPosTags(),
                                 vector<int>(
                                   token_dictionary_->GetNumPosTags(), 0));

  maximum_right_distances_.clear();
  maximum_right_distances_.resize(token_dictionary_->GetNumPosTags(),
                                  vector<int>(
                                    token_dictionary_->GetNumPosTags(), 0));

  reader->Open(pipe_->GetOptions()->GetTrainingFilePath());
  instance = reader->GetNext();
  while (instance != NULL) {
    int instance_length = instance->size();
    for (int i = 1; i < instance_length; ++i) {
      int id;
      int head = instance->GetHead(i);
      const string &modifier_pos = instance->GetPosTag(i);
      const string &head_pos = instance->GetPosTag(head);
      int modifier_pos_id = token_dictionary_->GetPosTagId(modifier_pos);
      int head_pos_id = token_dictionary_->GetPosTagId(head_pos);
      if (modifier_pos_id < 0) modifier_pos_id = TOKEN_UNKNOWN;
      if (head_pos_id < 0) head_pos_id = TOKEN_UNKNOWN;
      //CHECK_GE(modifier_pos_id, 0);
      //CHECK_GE(head_pos_id, 0);

      id = label_alphabet_.Lookup(instance->GetDependencyRelation(i));
      CHECK_GE(id, 0);

      // Insert new label in the set of existing labels, if it is not there
      // already. NOTE: this is inefficient, maybe we should be using a 
      // different data structure.
      vector<int> &labels = existing_labels_[modifier_pos_id][head_pos_id];
      int j;
      for (j = 0; j < labels.size(); ++j) {
        if (labels[j] == id) break;
      }
      if (j == labels.size()) labels.push_back(id);

      // Update the maximum distances if necessary.
      if (head != 0) {
        if (head < i) {
          // Right attachment.
          if (i - head > maximum_right_distances_[modifier_pos_id][head_pos_id]) {
            maximum_right_distances_[modifier_pos_id][head_pos_id] = i - head;
          }
        } else {
          // Left attachment.
          if (head - i > maximum_left_distances_[modifier_pos_id][head_pos_id]) {
            maximum_left_distances_[modifier_pos_id][head_pos_id] = head - i;
          }
        }
      }
    }
    delete instance;
    instance = reader->GetNext();
  }
  reader->Close();

  LOG(INFO) << "Number of labels: " << label_alphabet_.size();
}