コード例 #1
0
// Creates the unigram parts for a sentence: one part per (position, tag)
// pair, for every tag allowed at that position.
//
// instance:     the sentence; downcast to SequenceInstanceNumeric.
// parts:        container receiving the new parts; downcast to SequenceParts.
// gold_outputs: if non-NULL, receives one value per created part, in the same
//               order: 1.0 when the part's tag equals the sentence's tag at
//               that position, 0.0 otherwise.
//
// The allowed tags for each position come from GetAllowedTags(); if that
// yields an empty list, every tag in the tag alphabet is allowed. At the
// first/last position, tags whose start/stop bigram is disallowed by the
// dictionary are skipped. After all parts are added, the unigram offset is
// recorded as (index of first new part, number of new parts).
void SequencePipe::MakeUnigramParts(Instance *instance,
                                    Parts *parts,
                                    vector<double> *gold_outputs) {
  SequenceInstanceNumeric *sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts);
  SequenceDictionary *sequence_dictionary = GetSequenceDictionary();
  const int sentence_length = sentence->size();
  const bool make_gold = (gold_outputs != NULL);

  // Identity list 0..N-1 over the tag alphabet, used as the fallback when no
  // restricted tag list is available for a position.
  vector<int> all_tags(sequence_dictionary->GetTagAlphabet().size());
  for (size_t t = 0; t < all_tags.size(); ++t) {
    all_tags[t] = static_cast<int>(t);
  }

  const int num_parts_initial = sequence_parts->size();

  vector<int> allowed_tags;
  for (int i = 0; i < sentence_length; ++i) {
    GetAllowedTags(instance, i, &allowed_tags);
    if (allowed_tags.empty()) {
      // Copy rather than alias: allowed_tags is handed back to
      // GetAllowedTags() on the next iteration, and whether that call clears
      // it first is not visible from here.
      allowed_tags = all_tags;
    }

    const bool is_first = (i == 0);
    const bool is_last = (i == sentence_length - 1);

    // Add parts.
    for (size_t k = 0; k < allowed_tags.size(); ++k) {
      const int tag = allowed_tags[k];

      // Don't create a unigram part if a start/stop bigram is not allowed
      // (-1 stands in for the sentence boundary on the corresponding side).
      if ((is_first && !sequence_dictionary->IsAllowedBigram(-1, tag)) ||
          (is_last && !sequence_dictionary->IsAllowedBigram(tag, -1))) {
        continue;
      }

      Part *part = sequence_parts->CreatePartUnigram(i, tag);
      sequence_parts->push_back(part);
      if (make_gold) {
        gold_outputs->push_back(sentence->GetTagId(i) == tag ? 1.0 : 0.0);
      }
    }
  }

  sequence_parts->SetOffsetUnigram(num_parts_initial,
      sequence_parts->size() - num_parts_initial);
}
コード例 #2
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Creates the unigram parts for a sentence: one part per (position, tag)
// pair, for every tag allowed at that position.
//
// instance:     the sentence; downcast to SequenceInstanceNumeric.
// parts:        container receiving the new parts; downcast to SequenceParts.
// gold_outputs: if non-NULL, receives one value per created part, in the same
//               order: 1.0 when the part's tag equals the sentence's tag at
//               that position, 0.0 otherwise.
//
// When the prune_tags option is set, the tags for a position are those the
// dictionary lists for the word form at that position; if that list is empty
// (e.g. an unknown word), or if pruning is disabled, every tag in the tag
// alphabet is used. After all parts are added, the unigram offset is recorded
// as (index of first new part, number of new parts).
void SequencePipe::MakeUnigramParts(Instance *instance,
                                    Parts *parts,
                                    vector<double> *gold_outputs) {
  SequenceInstanceNumeric *sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts);
  SequenceDictionary *sequence_dictionary = GetSequenceDictionary();
  SequenceOptions *sequence_options = GetSequenceOptions();
  const int sentence_length = sentence->size();
  const bool make_gold = (gold_outputs != NULL);
  const bool prune_tags = sequence_options->prune_tags();

  // Identity list 0..N-1 over the tag alphabet, used whenever no restricted
  // tag list applies.
  vector<int> all_tags(sequence_dictionary->GetTagAlphabet().size());
  for (size_t t = 0; t < all_tags.size(); ++t) {
    all_tags[t] = static_cast<int>(t);
  }

  const int num_parts_initial = sequence_parts->size();

  // Scratch storage for the per-word tag list; reused across positions.
  vector<int> word_tags;
  for (int i = 0; i < sentence_length; ++i) {
    // Point at the list to iterate instead of copying all_tags for every
    // token; only the dictionary lookup result is copied.
    const vector<int> *tags = &all_tags;
    if (prune_tags) {
      const int word_id = sentence->GetFormId(i);
      word_tags = sequence_dictionary->GetWordTags(word_id);
      // For unknown words (empty list), keep allowing all the tags.
      if (!word_tags.empty()) {
        tags = &word_tags;
      }
    }

    // Add parts.
    for (size_t k = 0; k < tags->size(); ++k) {
      const int tag = (*tags)[k];
      Part *part = sequence_parts->CreatePartUnigram(i, tag);
      sequence_parts->push_back(part);
      if (make_gold) {
        gold_outputs->push_back(sentence->GetTagId(i) == tag ? 1.0 : 0.0);
      }
    }
  }

  sequence_parts->SetOffsetUnigram(num_parts_initial,
      sequence_parts->size() - num_parts_initial);
}