void SequencePipe::MakeUnigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); SequenceDictionary *sequence_dictionary = GetSequenceDictionary(); SequenceOptions *sequence_options = GetSequenceOptions(); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); vector<int> all_tags; vector<int> allowed_tags; all_tags.resize(sequence_dictionary->GetTagAlphabet().size()); for (int i = 0; i < all_tags.size(); ++i) { all_tags[i] = i; } int num_parts_initial = sequence_parts->size(); for (int i = 0; i < sentence_length; ++i) { GetAllowedTags(instance, i, &allowed_tags); if (allowed_tags.empty()) { allowed_tags = all_tags; } // Add parts. CHECK_GE(allowed_tags.size(), 0); for (int k = 0; k < allowed_tags.size(); ++k) { int tag = allowed_tags[k]; // Don't create a inigram part if a start/stop bigram is not allowed. if (i == 0 && !sequence_dictionary->IsAllowedBigram(-1, tag)) { continue; } else if (i == sentence_length - 1 && !sequence_dictionary->IsAllowedBigram(tag, -1)) { continue; } Part *part = sequence_parts->CreatePartUnigram(i, tag); sequence_parts->push_back(part); if (make_gold) { if (sentence->GetTagId(i) == tag) { gold_outputs->push_back(1.0); } else { gold_outputs->push_back(0.0); } } } } sequence_parts->SetOffsetUnigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }
void SequencePipe::MakeUnigramParts(Instance *instance, Parts *parts, vector<double> *gold_outputs) { SequenceInstanceNumeric *sentence = static_cast<SequenceInstanceNumeric*>(instance); SequenceParts *sequence_parts = static_cast<SequenceParts*>(parts); SequenceDictionary *sequence_dictionary = GetSequenceDictionary(); SequenceOptions *sequence_options = GetSequenceOptions(); int sentence_length = sentence->size(); bool make_gold = (gold_outputs != NULL); bool prune_tags = sequence_options->prune_tags(); vector<int> all_tags; vector<int> allowed_tags; all_tags.resize(sequence_dictionary->GetTagAlphabet().size()); for (int i = 0; i < all_tags.size(); ++i) { all_tags[i] = i; } int num_parts_initial = sequence_parts->size(); for (int i = 0; i < sentence_length; ++i) { if (prune_tags) { int word_id = sentence->GetFormId(i); allowed_tags = sequence_dictionary->GetWordTags(word_id); // For unknown words, allow all the tags. if (allowed_tags.empty()) { allowed_tags = all_tags; } } else { allowed_tags = all_tags; } // Add parts. CHECK_GE(allowed_tags.size(), 0); for (int k = 0; k < allowed_tags.size(); ++k) { int tag = allowed_tags[k]; Part *part = sequence_parts->CreatePartUnigram(i, tag); sequence_parts->push_back(part); if (make_gold) { if (sentence->GetTagId(i) == tag) { gold_outputs->push_back(1.0); } else { gold_outputs->push_back(0.0); } } } } sequence_parts->SetOffsetUnigram(num_parts_initial, sequence_parts->size() - num_parts_initial); }