// Populates `features` with the unigram, bigram and trigram features of the
// sentence held in `instance`.
//
// instance:       downcast to SequenceInstanceNumeric.
// parts:          only forwarded to SequenceFeatures::Initialize().
// selected_parts: not read by this implementation.
// features:       downcast to SequenceFeatures; receives the features.
//
// Bigram features are added only when the Markov order is >= 1, trigram
// features only when it is >= 2.
void SequencePipe::MakeSelectedFeatures(Instance *instance,
                                        Parts *parts,
                                        const vector<bool> &selected_parts,
                                        Features *features) {
  SequenceInstanceNumeric *numeric_sentence =
      static_cast<SequenceInstanceNumeric*>(instance);
  SequenceFeatures *feature_store = static_cast<SequenceFeatures*>(features);
  const int num_words = numeric_sentence->size();

  feature_store->Initialize(instance, parts);

  // Word-level features only; the tags get conjoined with them later on.
  for (int position = 0; position < num_words; ++position) {
    feature_store->AddUnigramFeatures(numeric_sentence, position);
  }

  const int order = GetSequenceOptions()->markov_order();
  if (order < 1) return;

  // Bigram positions run over 0..num_words inclusive.
  for (int position = 0; position <= num_words; ++position) {
    feature_store->AddBigramFeatures(numeric_sentence, position);
  }
  if (order < 2) return;

  // Trigram positions run over 1..num_words inclusive.
  for (int position = 1; position <= num_words; ++position) {
    feature_store->AddTrigramFeatures(numeric_sentence, position);
  }
}
void SequencePipe::MakeUnigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); SequenceDictionary *sequence_dictionary = GetSequenceDictionary(); SequenceOptions *sequence_options = GetSequenceOptions(); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); vector<int> all_tags; vector<int> allowed_tags; all_tags.resize(sequence_dictionary->GetTagAlphabet().size()); for (int i = 0; i < all_tags.size(); ++i) { all_tags[i] = i; } int num_parts_initial = sequence_parts->size(); for (int i = 0; i < sentence_length; ++i) { GetAllowedTags(instance, i, &allowed_tags); if (allowed_tags.empty()) { allowed_tags = all_tags; } // Add parts. CHECK_GE(allowed_tags.size(), 0); for (int k = 0; k < allowed_tags.size(); ++k) { int tag = allowed_tags[k]; // Don't create a inigram part if a start/stop bigram is not allowed. if (i == 0 && !sequence_dictionary->IsAllowedBigram(-1, tag)) { continue; } else if (i == sentence_length - 1 && !sequence_dictionary->IsAllowedBigram(tag, -1)) { continue; } Part *part = sequence_parts->CreatePartUnigram(i, tag); sequence_parts->push_back(part); if (make_gold) { if (sentence->GetTagId(i) == tag) { gold_outputs->push_back(1.0); } else { gold_outputs->push_back(0.0); } } } } sequence_parts->SetOffsetUnigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }
void SequencePipe::MakeUnigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); SequenceDictionary *sequence_dictionary = GetSequenceDictionary(); SequenceOptions *sequence_options = GetSequenceOptions(); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); bool prune_tags = sequence_options->prune_tags(); vector<int> all_tags; vector<int> allowed_tags; all_tags.resize(sequence_dictionary->GetTagAlphabet().size()); for (int i = 0; i < all_tags.size(); ++i) { all_tags[i] = i; } int num_parts_initial = sequence_parts->size(); for (int i = 0; i < sentence_length; ++i) { if (prune_tags) { int word_id = sentence->GetFormId(i); allowed_tags = sequence_dictionary->GetWordTags(word_id); // For unknown words, allow all the tags. if (allowed_tags.empty()) { allowed_tags = all_tags; } } else { allowed_tags = all_tags; } // Add parts. CHECK_GE(allowed_tags.size(), 0); for (int k = 0; k < allowed_tags.size(); ++k) { int tag = allowed_tags[k]; Part *part = sequence_parts->CreatePartUnigram(i, tag); sequence_parts->push_back(part); if (make_gold) { if (sentence->GetTagId(i) == tag) { gold_outputs->push_back(1.0); } else { gold_outputs->push_back(0.0); } } } } sequence_parts->SetOffsetUnigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }
void SequencePipe::ComputeScores(Instance *instance, Parts *parts, Features *features, vector<double> *scores) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); SequenceFeatures *sequence_features = static_cast<SequenceFeatures*>(features); SequenceDictionary *sequence_dictionary = GetSequenceDictionary(); scores->resize(parts->size()); // Compute scores for the unigram parts. for (int i = 0; i < sentence->size(); ++i) { // Conjoin unigram features with the tag. const BinaryFeatures &unigram_features = sequence_features->GetUnigramFeatures(i); const vector<int> &index_unigram_parts = sequence_parts->FindUnigramParts(i); vector<int> allowed_tags(index_unigram_parts.size()); for (int k = 0; k < index_unigram_parts.size(); ++k) { SequencePartUnigram *unigram = static_cast<SequencePartUnigram*>((*parts)[index_unigram_parts[k]]); allowed_tags[k] = unigram->tag(); } vector<double> tag_scores; parameters_->ComputeLabelScores(unigram_features, allowed_tags, &tag_scores); for (int k = 0; k < index_unigram_parts.size(); ++k) { (*scores)[index_unigram_parts[k]] = tag_scores[k]; } } // Compute scores for the bigram parts. if (GetSequenceOptions()->markov_order() >= 1) { for (int i = 0; i < sentence->size() + 1; ++i) { // Conjoin bigram features with the pair of tags. 
const BinaryFeatures &bigram_features = sequence_features->GetBigramFeatures(i); const vector<int> &index_bigram_parts = sequence_parts->FindBigramParts(i); vector<int> bigram_tags(index_bigram_parts.size()); for (int k = 0; k < index_bigram_parts.size(); ++k) { SequencePartBigram *bigram = static_cast<SequencePartBigram*>((*parts)[index_bigram_parts[k]]); bigram_tags[k] = sequence_dictionary->GetBigramLabel(bigram->tag_left(), bigram->tag()); } vector<double> tag_scores; parameters_->ComputeLabelScores(bigram_features, bigram_tags, &tag_scores); for (int k = 0; k < index_bigram_parts.size(); ++k) { (*scores)[index_bigram_parts[k]] = tag_scores[k]; } } } // Compute scores for the trigram parts. if (GetSequenceOptions()->markov_order() >= 2) { for (int i = 1; i < sentence->size() + 1; ++i) { // Conjoin trigram features with the triple of tags. const BinaryFeatures &trigram_features = sequence_features->GetTrigramFeatures(i); const vector<int> &index_trigram_parts = sequence_parts->FindTrigramParts(i); vector<int> trigram_tags(index_trigram_parts.size()); for (int k = 0; k < index_trigram_parts.size(); ++k) { SequencePartTrigram *trigram = static_cast<SequencePartTrigram*>((*parts)[index_trigram_parts[k]]); trigram_tags[k] = sequence_dictionary->GetTrigramLabel( trigram->tag_left_left(), trigram->tag_left(), trigram->tag()); } vector<double> tag_scores; parameters_->ComputeLabelScores(trigram_features, trigram_tags, &tag_scores); for (int k = 0; k < index_trigram_parts.size(); ++k) { (*scores)[index_trigram_parts[k]] = tag_scores[k]; } } } }
void SequencePipe::MakeTrigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); int num_parts_initial = sequence_parts->size(); if (sentence_length <= 1) return; // Start position. const vector<int> &initial_parts = sequence_parts->FindUnigramParts(0); const vector<int> &next_initial_parts = sequence_parts->FindUnigramParts(1); for (int j = 0; j < next_initial_parts.size(); ++j) { SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[next_initial_parts[j]]); for (int k = 0; k < initial_parts.size(); ++k) { SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[initial_parts[k]]); Part *part = sequence_parts->CreatePartTrigram(1, current_part->tag(), previous_part->tag(), -1 /* start symbol */); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back((*gold_outputs)[next_initial_parts[j]] * (*gold_outputs)[initial_parts[k]]); } } } // Intermediate position. 
for (int i = 2; i < sentence_length; ++i) { const vector<int> ¤t_parts = sequence_parts->FindUnigramParts(i); const vector<int> &previous_parts = sequence_parts->FindUnigramParts(i - 1); const vector<int> &before_previous_parts = sequence_parts->FindUnigramParts(i - 2); for (int j = 0; j < current_parts.size(); ++j) { SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[current_parts[j]]); for (int k = 0; k < previous_parts.size(); ++k) { SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[previous_parts[k]]); for (int l = 0; l < before_previous_parts.size(); ++l) { SequencePartUnigram *before_previous_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[before_previous_parts[l]]); Part *part = sequence_parts->CreatePartTrigram(i, current_part->tag(), previous_part->tag(), before_previous_part->tag()); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back( (*gold_outputs)[current_parts[j]] * (*gold_outputs)[previous_parts[k]] * (*gold_outputs)[before_previous_parts[l]]); } } } } } // Final position. 
const vector<int> &final_parts = sequence_parts->FindUnigramParts(sentence_length - 1); const vector<int> &before_final_parts = sequence_parts->FindUnigramParts(sentence_length - 2); for (int j = 0; j < final_parts.size(); ++j) { SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[final_parts[j]]); for (int k = 0; k < before_final_parts.size(); ++k) { SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[before_final_parts[k]]); Part *part = sequence_parts->CreatePartTrigram(sentence_length, -1, /* stop symbol */ current_part->tag(), previous_part->tag()); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back((*gold_outputs)[final_parts[j]] * (*gold_outputs)[before_final_parts[k]]); } } } sequence_parts->SetOffsetTrigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }
void SequencePipe::MakeBigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); int num_parts_initial = sequence_parts->size(); // Start position. const vector<int> &initial_parts = sequence_parts->FindUnigramParts(0); for (int j = 0; j < initial_parts.size(); ++j) { SequencePartUnigram *initial_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[initial_parts[j]]); Part *part = sequence_parts->CreatePartBigram(0, initial_part->tag(), -1); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back((*gold_outputs)[initial_parts[j]]); } } // Intermediate position. for (int i = 1; i < sentence_length; ++i) { const vector<int> ¤t_parts = sequence_parts->FindUnigramParts(i); const vector<int> &previous_parts = sequence_parts->FindUnigramParts(i - 1); for (int j = 0; j < current_parts.size(); ++j) { SequencePartUnigram *current_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[current_parts[j]]); for (int k = 0; k < previous_parts.size(); ++k) { SequencePartUnigram *previous_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[previous_parts[k]]); Part *part = sequence_parts->CreatePartBigram(i, current_part->tag(), previous_part->tag()); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back( (*gold_outputs)[current_parts[j]] * (*gold_outputs)[previous_parts[k]]); } } } } // Final position. 
const vector<int> &final_parts = sequence_parts->FindUnigramParts(sentence_length - 1); for (int j = 0; j < final_parts.size(); ++j) { SequencePartUnigram *final_part = static_cast<SequencePartUnigram *>( (*sequence_parts)[final_parts[j]]); Part *part = sequence_parts->CreatePartBigram(sentence_length, -1, final_part->tag()); sequence_parts->push_back(part); if (make_gold) { gold_outputs->push_back((*gold_outputs)[final_parts[j]]); } } sequence_parts->SetOffsetBigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }