コード例 #1
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Populates `features` with the word-level features of `instance`: unigram
// features for every token, plus bigram features when the Markov order is at
// least 1 and trigram features when it is at least 2. Only the words are
// looked at here; the features are conjoined with tags later.
//
//   instance       - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts          - forwarded to SequenceFeatures::Initialize.
//   selected_parts - not consulted by this implementation.
//   features       - cast to SequenceFeatures (unchecked static_cast);
//                    receives the generated features.
void SequencePipe::MakeSelectedFeatures(Instance *instance,
                                        Parts *parts,
                                        const vector<bool> &selected_parts,
                                        Features *features) {
  SequenceInstanceNumeric *numeric_sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceFeatures *feature_store = static_cast<SequenceFeatures*>(features);
  const int num_words = numeric_sentence->size();

  feature_store->Initialize(instance, parts);

  // One set of word-only features per token.
  for (int pos = 0; pos < num_words; ++pos) {
    feature_store->AddUnigramFeatures(numeric_sentence, pos);
  }

  const int markov_order = GetSequenceOptions()->markov_order();
  if (markov_order < 1) return;

  // Bigram features cover positions 0 through num_words, i.e. one more
  // position than there are tokens.
  for (int pos = 0; pos <= num_words; ++pos) {
    feature_store->AddBigramFeatures(numeric_sentence, pos);
  }

  if (markov_order < 2) return;

  // Trigram features cover positions 1 through num_words.
  for (int pos = 1; pos <= num_words; ++pos) {
    feature_store->AddTrigramFeatures(numeric_sentence, pos);
  }
}
コード例 #2
0
// Creates one unigram part per (position, candidate tag) pair and registers
// the resulting range of parts through SetOffsetUnigram.
//
// The candidate tags of position i are whatever GetAllowedTags returns for
// it; if that list comes back empty, every tag id of the tag alphabet is
// used instead. A candidate is skipped at the first position when the
// dictionary disallows the bigram (-1, tag), and at the last position when it
// disallows (tag, -1), where -1 stands for the start/stop symbol.
//
//   instance     - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts        - cast to SequenceParts (unchecked static_cast); the new
//                  parts are appended to it.
//   gold_outputs - may be NULL. Otherwise one value is appended per created
//                  part: 1.0 if the part's tag equals the instance's tag at
//                  that position, 0.0 if not.
void SequencePipe::MakeUnigramParts(Instance *instance,
                                    Parts *parts,
                                    vector<double> *gold_outputs) {
  SequenceInstanceNumeric *sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts);
  SequenceDictionary *sequence_dictionary = GetSequenceDictionary();
  const int sentence_length = sentence->size();
  const bool make_gold = (gold_outputs != NULL);

  // Fallback tag set: every tag id of the alphabet, in increasing order.
  vector<int> all_tags;
  all_tags.resize(sequence_dictionary->GetTagAlphabet().size());
  const int num_tags = static_cast<int>(all_tags.size());
  for (int t = 0; t < num_tags; ++t) {
    all_tags[t] = t;
  }

  const int num_parts_initial = sequence_parts->size();
  vector<int> allowed_tags;

  for (int i = 0; i < sentence_length; ++i) {
    GetAllowedTags(instance, i, &allowed_tags);
    if (allowed_tags.empty()) {
      allowed_tags = all_tags;
    }

    const bool is_first = (i == 0);
    const bool is_last = (i == sentence_length - 1);

    // Add parts. If the candidate list is still empty at this point (which
    // requires an empty tag alphabet), the loop simply creates nothing.
    const int num_allowed = static_cast<int>(allowed_tags.size());
    for (int k = 0; k < num_allowed; ++k) {
      const int tag = allowed_tags[k];

      // Don't create a unigram part if a start/stop bigram is not allowed.
      if (is_first && !sequence_dictionary->IsAllowedBigram(-1, tag)) {
        continue;
      }
      if (is_last && !sequence_dictionary->IsAllowedBigram(tag, -1)) {
        continue;
      }

      Part *part = sequence_parts->CreatePartUnigram(i, tag);
      sequence_parts->push_back(part);
      if (make_gold) {
        gold_outputs->push_back(sentence->GetTagId(i) == tag ? 1.0 : 0.0);
      }
    }
  }

  sequence_parts->SetOffsetUnigram(num_parts_initial,
      sequence_parts->size() - num_parts_initial);
}
コード例 #3
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Creates one unigram part for every (position, candidate tag) pair of the
// sentence and registers the resulting range through SetOffsetUnigram.
//
// With tag pruning enabled in the options, the candidates of a position are
// the tags the dictionary lists for the word form at that position; words
// with no listed tags (unknown words) fall back to the complete tag set.
// With pruning disabled, every position gets the complete tag set.
//
//   instance     - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts        - cast to SequenceParts (unchecked static_cast); the new
//                  parts are appended to it.
//   gold_outputs - may be NULL. Otherwise one value is appended per created
//                  part: 1.0 if the part's tag equals the instance's tag at
//                  that position, 0.0 if not.
void SequencePipe::MakeUnigramParts(Instance *instance,
                                    Parts *parts,
                                    vector<double> *gold_outputs) {
  SequenceInstanceNumeric *numeric_sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *unigram_store = static_cast<SequenceParts*>(parts);
  SequenceDictionary *dictionary = GetSequenceDictionary();
  SequenceOptions *options = GetSequenceOptions();
  const int num_words = numeric_sentence->size();
  const bool want_gold = (gold_outputs != NULL);
  const bool prune_tags = options->prune_tags();

  // The complete tag set: ids 0 .. (alphabet size - 1).
  vector<int> every_tag;
  every_tag.resize(dictionary->GetTagAlphabet().size());
  const int num_tags = static_cast<int>(every_tag.size());
  for (int t = 0; t < num_tags; ++t) {
    every_tag[t] = t;
  }

  const int first_new_part = unigram_store->size();
  vector<int> allowed_tags;

  for (int position = 0; position < num_words; ++position) {
    // Start with no candidates; pruning may supply a word-specific list.
    allowed_tags.clear();
    if (prune_tags) {
      const int word_id = numeric_sentence->GetFormId(position);
      allowed_tags = dictionary->GetWordTags(word_id);
    }
    // Pruning disabled, or an unknown word: allow all the tags.
    if (allowed_tags.empty()) {
      allowed_tags = every_tag;
    }

    // Add parts.
    CHECK_GE(allowed_tags.size(), 0);
    const int num_allowed = static_cast<int>(allowed_tags.size());
    for (int k = 0; k < num_allowed; ++k) {
      const int candidate = allowed_tags[k];
      unigram_store->push_back(
          unigram_store->CreatePartUnigram(position, candidate));
      if (!want_gold) continue;
      const bool is_gold_tag =
        (numeric_sentence->GetTagId(position) == candidate);
      gold_outputs->push_back(is_gold_tag ? 1.0 : 0.0);
    }
  }

  unigram_store->SetOffsetUnigram(first_new_part,
      unigram_store->size() - first_new_part);
}
コード例 #4
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Fills `scores` with one score per part, indexed like `parts`.
//
// For each position the position's features are scored against the labels of
// the parts located there, through parameters_->ComputeLabelScores:
//   - unigram parts use the part's tag as the label;
//   - bigram parts (Markov order >= 1) use the dictionary's bigram label of
//     (tag_left, tag);
//   - trigram parts (Markov order >= 2) use the dictionary's trigram label of
//     (tag_left_left, tag_left, tag).
//
//   instance - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts    - cast to SequenceParts (unchecked static_cast).
//   features - cast to SequenceFeatures (unchecked static_cast).
//   scores   - resized to parts->size(); entries of parts that none of the
//              loops below visits keep whatever value resize() left in them.
void SequencePipe::ComputeScores(Instance *instance, Parts *parts,
                                 Features *features,
                                 vector<double> *scores) {
  SequenceInstanceNumeric *numeric_sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *part_store = static_cast<SequenceParts*>(parts);
  SequenceFeatures *feature_store = static_cast<SequenceFeatures*>(features);
  SequenceDictionary *dictionary = GetSequenceDictionary();
  scores->resize(parts->size());

  const int num_words = numeric_sentence->size();

  // Unigram parts: conjoin the unigram features with each candidate tag.
  for (int pos = 0; pos < num_words; ++pos) {
    const BinaryFeatures &position_features =
      feature_store->GetUnigramFeatures(pos);
    const vector<int> &part_ids = part_store->FindUnigramParts(pos);
    const int num_candidates = static_cast<int>(part_ids.size());

    vector<int> labels(part_ids.size());
    for (int c = 0; c < num_candidates; ++c) {
      SequencePartUnigram *unigram_part =
          static_cast<SequencePartUnigram*>((*parts)[part_ids[c]]);
      labels[c] = unigram_part->tag();
    }

    vector<double> label_scores;
    parameters_->ComputeLabelScores(position_features, labels, &label_scores);
    for (int c = 0; c < num_candidates; ++c) {
      (*scores)[part_ids[c]] = label_scores[c];
    }
  }

  const int markov_order = GetSequenceOptions()->markov_order();
  if (markov_order < 1) return;

  // Bigram parts: conjoin the bigram features with each pair of tags.
  // Positions run from 0 through num_words.
  for (int pos = 0; pos <= num_words; ++pos) {
    const BinaryFeatures &position_features =
      feature_store->GetBigramFeatures(pos);
    const vector<int> &part_ids = part_store->FindBigramParts(pos);
    const int num_candidates = static_cast<int>(part_ids.size());

    vector<int> labels(part_ids.size());
    for (int c = 0; c < num_candidates; ++c) {
      SequencePartBigram *bigram_part =
          static_cast<SequencePartBigram*>((*parts)[part_ids[c]]);
      labels[c] = dictionary->GetBigramLabel(bigram_part->tag_left(),
                                             bigram_part->tag());
    }

    vector<double> label_scores;
    parameters_->ComputeLabelScores(position_features, labels, &label_scores);
    for (int c = 0; c < num_candidates; ++c) {
      (*scores)[part_ids[c]] = label_scores[c];
    }
  }

  if (markov_order < 2) return;

  // Trigram parts: conjoin the trigram features with each triple of tags.
  // Positions run from 1 through num_words.
  for (int pos = 1; pos <= num_words; ++pos) {
    const BinaryFeatures &position_features =
      feature_store->GetTrigramFeatures(pos);
    const vector<int> &part_ids = part_store->FindTrigramParts(pos);
    const int num_candidates = static_cast<int>(part_ids.size());

    vector<int> labels(part_ids.size());
    for (int c = 0; c < num_candidates; ++c) {
      SequencePartTrigram *trigram_part =
          static_cast<SequencePartTrigram*>((*parts)[part_ids[c]]);
      labels[c] = dictionary->GetTrigramLabel(trigram_part->tag_left_left(),
                                              trigram_part->tag_left(),
                                              trigram_part->tag());
    }

    vector<double> label_scores;
    parameters_->ComputeLabelScores(position_features, labels, &label_scores);
    for (int c = 0; c < num_candidates; ++c) {
      (*scores)[part_ids[c]] = label_scores[c];
    }
  }
}
コード例 #5
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Creates the trigram parts of a sentence by combining the unigram parts
// already present in `parts`, then registers the resulting range through
// SetOffsetTrigram.
//
// Three groups of parts are produced, in this order:
//   - start:        position 1, (tag at 1, tag at 0, -1 as start symbol);
//   - intermediate: position i in [2, sentence_length), built from the tags
//                   at i, i - 1 and i - 2;
//   - final:        position sentence_length, (-1 as stop symbol, tag at the
//                   last word, tag at the word before it).
//
// Sentences with at most one word get no trigram parts; an empty trigram
// range is still registered for them so that the offset is set on every
// path through this function.
// NOTE(review): presumably downstream code reads that offset when indexing
// the trigram parts - confirm against SequenceParts.
//
//   instance     - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts        - cast to SequenceParts (unchecked static_cast); must
//                  already hold the unigram parts. New parts are appended.
//   gold_outputs - may be NULL. Otherwise it is indexed by part index, and
//                  one value is appended per created part: the product of
//                  the gold values of the unigram parts it was built from.
void SequencePipe::MakeTrigramParts(Instance *instance,
                                    Parts *parts,
                                    vector<double> *gold_outputs) {
  SequenceInstanceNumeric *sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts);
  const int sentence_length = sentence->size();
  const bool make_gold = (gold_outputs != NULL);

  const int num_parts_initial = sequence_parts->size();

  if (sentence_length <= 1) {
    // No trigram can be formed. Register an empty range rather than leaving
    // the trigram offset untouched.
    sequence_parts->SetOffsetTrigram(num_parts_initial, 0);
    return;
  }

  // Start position.
  const vector<int> &initial_parts = sequence_parts->FindUnigramParts(0);
  const vector<int> &next_initial_parts = sequence_parts->FindUnigramParts(1);
  const int num_initial = static_cast<int>(initial_parts.size());
  const int num_next_initial = static_cast<int>(next_initial_parts.size());
  for (int j = 0; j < num_next_initial; ++j) {
    SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>(
            (*sequence_parts)[next_initial_parts[j]]);
    for (int k = 0; k < num_initial; ++k) {
      SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>(
              (*sequence_parts)[initial_parts[k]]);
      Part *part = sequence_parts->CreatePartTrigram(1,
                                                     current_part->tag(),
                                                     previous_part->tag(),
                                                     -1 /* start symbol */);
      sequence_parts->push_back(part);
      if (make_gold) {
        // Read the factors before appending to the same vector.
        const double gold = (*gold_outputs)[next_initial_parts[j]] *
                            (*gold_outputs)[initial_parts[k]];
        gold_outputs->push_back(gold);
      }
    }
  }

  // Intermediate position.
  for (int i = 2; i < sentence_length; ++i) {
    const vector<int> &current_parts = sequence_parts->FindUnigramParts(i);
    const vector<int> &previous_parts = sequence_parts->FindUnigramParts(i - 1);
    const vector<int> &before_previous_parts =
      sequence_parts->FindUnigramParts(i - 2);
    const int num_current = static_cast<int>(current_parts.size());
    const int num_previous = static_cast<int>(previous_parts.size());
    const int num_before_previous =
      static_cast<int>(before_previous_parts.size());
    for (int j = 0; j < num_current; ++j) {
      SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>(
              (*sequence_parts)[current_parts[j]]);
      for (int k = 0; k < num_previous; ++k) {
        SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>(
                (*sequence_parts)[previous_parts[k]]);
        for (int l = 0; l < num_before_previous; ++l) {
          SequencePartUnigram *before_previous_part =
              static_cast<SequencePartUnigram *>(
                  (*sequence_parts)[before_previous_parts[l]]);
          Part *part =
            sequence_parts->CreatePartTrigram(i,
                                              current_part->tag(),
                                              previous_part->tag(),
                                              before_previous_part->tag());
          sequence_parts->push_back(part);
          if (make_gold) {
            const double gold = (*gold_outputs)[current_parts[j]] *
                                (*gold_outputs)[previous_parts[k]] *
                                (*gold_outputs)[before_previous_parts[l]];
            gold_outputs->push_back(gold);
          }
        }
      }
    }
  }

  // Final position.
  const vector<int> &final_parts =
      sequence_parts->FindUnigramParts(sentence_length - 1);
  const vector<int> &before_final_parts =
      sequence_parts->FindUnigramParts(sentence_length - 2);
  const int num_final = static_cast<int>(final_parts.size());
  const int num_before_final = static_cast<int>(before_final_parts.size());
  for (int j = 0; j < num_final; ++j) {
    SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>(
            (*sequence_parts)[final_parts[j]]);
    for (int k = 0; k < num_before_final; ++k) {
      SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>(
              (*sequence_parts)[before_final_parts[k]]);
      Part *part = sequence_parts->CreatePartTrigram(sentence_length,
                                                     -1, /* stop symbol */
                                                     current_part->tag(),
                                                     previous_part->tag());
      sequence_parts->push_back(part);
      if (make_gold) {
        const double gold = (*gold_outputs)[final_parts[j]] *
                            (*gold_outputs)[before_final_parts[k]];
        gold_outputs->push_back(gold);
      }
    }
  }

  sequence_parts->SetOffsetTrigram(num_parts_initial,
      sequence_parts->size() - num_parts_initial);
}
コード例 #6
0
ファイル: SequencePipe.cpp プロジェクト: arunwizz/TurboParser
// Creates the bigram parts of a sentence by pairing up the unigram parts
// already present in `parts`, then registers the resulting range through
// SetOffsetBigram.
//
// Three groups of parts are produced, in this order:
//   - start:        position 0, (tag at 0, -1);
//   - intermediate: position i in [1, sentence_length), (tag at i, tag at
//                   i - 1);
//   - final:        position sentence_length, (-1, tag at the last word).
// NOTE(review): -1 presumably denotes the start/stop symbol here - confirm
// against SequenceParts::CreatePartBigram.
//
// An empty sentence produces no bigram parts; an empty range is registered
// for it without querying any unigram position.
//
//   instance     - cast to SequenceInstanceNumeric (unchecked static_cast).
//   parts        - cast to SequenceParts (unchecked static_cast); must
//                  already hold the unigram parts. New parts are appended.
//   gold_outputs - may be NULL. Otherwise it is indexed by part index, and
//                  one value is appended per created part: the product of
//                  the gold values of the unigram parts it was built from.
void SequencePipe::MakeBigramParts(Instance *instance,
                                   Parts *parts,
                                   vector<double> *gold_outputs) {
  SequenceInstanceNumeric *sentence =
    static_cast<SequenceInstanceNumeric*>(instance);
  SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts);
  const int sentence_length = sentence->size();
  const bool make_gold = (gold_outputs != NULL);

  const int num_parts_initial = sequence_parts->size();

  if (sentence_length <= 0) {
    // There is no position 0 and no position sentence_length - 1 to look
    // up, so skip the unigram queries below altogether.
    sequence_parts->SetOffsetBigram(num_parts_initial, 0);
    return;
  }

  // Start position.
  const vector<int> &initial_parts = sequence_parts->FindUnigramParts(0);
  const int num_initial = static_cast<int>(initial_parts.size());
  for (int j = 0; j < num_initial; ++j) {
    SequencePartUnigram *initial_part = static_cast<SequencePartUnigram *>(
            (*sequence_parts)[initial_parts[j]]);
    Part *part = sequence_parts->CreatePartBigram(0, initial_part->tag(), -1);
    sequence_parts->push_back(part);
    if (make_gold) {
      // Read the value before appending to the same vector.
      const double gold = (*gold_outputs)[initial_parts[j]];
      gold_outputs->push_back(gold);
    }
  }

  // Intermediate position.
  for (int i = 1; i < sentence_length; ++i) {
    const vector<int> &current_parts = sequence_parts->FindUnigramParts(i);
    const vector<int> &previous_parts = sequence_parts->FindUnigramParts(i - 1);
    const int num_current = static_cast<int>(current_parts.size());
    const int num_previous = static_cast<int>(previous_parts.size());
    for (int j = 0; j < num_current; ++j) {
      SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>(
              (*sequence_parts)[current_parts[j]]);
      for (int k = 0; k < num_previous; ++k) {
        SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>(
                (*sequence_parts)[previous_parts[k]]);
        Part *part = sequence_parts->CreatePartBigram(i,
                                                      current_part->tag(),
                                                      previous_part->tag());
        sequence_parts->push_back(part);
        if (make_gold) {
          const double gold = (*gold_outputs)[current_parts[j]] *
                              (*gold_outputs)[previous_parts[k]];
          gold_outputs->push_back(gold);
        }
      }
    }
  }

  // Final position.
  const vector<int> &final_parts =
      sequence_parts->FindUnigramParts(sentence_length - 1);
  const int num_final = static_cast<int>(final_parts.size());
  for (int j = 0; j < num_final; ++j) {
    SequencePartUnigram *final_part = static_cast<SequencePartUnigram *>(
            (*sequence_parts)[final_parts[j]]);
    Part *part = sequence_parts->CreatePartBigram(sentence_length,
                                                  -1,
                                                  final_part->tag());
    sequence_parts->push_back(part);
    if (make_gold) {
      const double gold = (*gold_outputs)[final_parts[j]];
      gold_outputs->push_back(gold);
    }
  }

  sequence_parts->SetOffsetBigram(num_parts_initial,
      sequence_parts->size() - num_parts_initial);
}