void SequenceInstanceNumeric::Initialize(const SequenceDictionary &dictionary, SequenceInstance* instance) { TokenDictionary *token_dictionary = dictionary.GetTokenDictionary(); int length = instance->size(); int i; int id; int prefix_length = FLAGS_prefix_length; int suffix_length = FLAGS_suffix_length; bool form_case_sensitive = FLAGS_form_case_sensitive; Clear(); form_ids_.resize(length); prefix_ids_.resize(length); suffix_ids_.resize(length); shape_ids_.resize(length); has_digit_.resize(length); has_upper_.resize(length); has_hyphen_.resize(length); all_digits_.resize(length); all_digits_with_punctuation_.resize(length); all_upper_.resize(length); first_upper_.resize(length); tag_ids_.resize(length); for (i = 0; i < length; i++) { std::string form = instance->GetForm(i); if (!form_case_sensitive) { transform(form.begin(), form.end(), form.begin(), ::tolower); } id = token_dictionary->GetFormId(form); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; form_ids_[i] = id; prefix_ids_[i].resize(prefix_length); for (int l = 0; l < prefix_length; ++l) { std::string prefix = form.substr(0, l + 1); id = token_dictionary->GetPrefixId(prefix); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; prefix_ids_[i][l] = id; } suffix_ids_[i].resize(suffix_length); for (int l = 0; l < suffix_length; ++l) { int start = form.length() - l - 1; if (start < 0) start = 0; std::string suffix = form.substr(start, l + 1); id = token_dictionary->GetSuffixId(suffix); CHECK_LT(id, 0xffff); if (id < 0) id = TOKEN_UNKNOWN; suffix_ids_[i][l] = id; } // Compute and store the word shape. std::string shape; dictionary.GetTokenDictionary()->GetWordShape(instance->GetForm(i), &shape); int shape_id = dictionary.GetTokenDictionary()->GetShapeId(shape); CHECK_LT(shape_id, 0xffff); if (shape_id < 0) shape_id = kUnknownShape; shape_ids_[i] = shape_id; // Compute and store various flags. const char* word = instance->GetForm(i).c_str(); int word_length = instance->GetForm(i).length(); int num_digits = CountDigits(word, word_length); has_digit_[i] = (num_digits > 0); has_upper_[i] = HasUpperCaseLetters(word, word_length); has_hyphen_[i] = HasHyphen(word, word_length); all_digits_[i] = AllDigits(word, word_length); all_digits_with_punctuation_[i] = AllDigitsWithPunctuation(word, word_length); all_upper_[i] = AllUpperCase(word, word_length); first_upper_[i] = IsCapitalized(word, word_length); //id = token_dictionary->GetPosTagId(instance->GetTag(i)); id = dictionary.GetTagAlphabet().Lookup(instance->GetTag(i)); //CHECK_LT(id, 0xff); //CHECK_GE(id, 0); if (id < 0) { id = TOKEN_UNKNOWN; VLOG(2) << "Unknown tag: " << instance->GetTag(i); } tag_ids_[i] = id; } }
void CoreferenceSentenceNumeric::Initialize( const CoreferenceDictionary &dictionary, CoreferenceSentence* instance, bool add_gold_mentions, std::map<std::string, int> *coreference_span_names) { TokenDictionary *token_dictionary = dictionary.GetTokenDictionary(); DependencyDictionary *dependency_dictionary = dictionary.GetDependencyDictionary(); SemanticDictionary *semantic_dictionary = dictionary.GetSemanticDictionary(); SemanticInstance *semantic_instance = static_cast<SemanticInstance*>(instance); Clear(); SemanticInstanceNumeric::Initialize(*semantic_dictionary, semantic_instance); const std::vector<EntitySpan*> &entity_spans = instance->GetEntitySpans(); entity_spans_.resize(entity_spans.size()); for (int k = 0; k < entity_spans.size(); ++k) { int start = entity_spans[k]->start(); int end = entity_spans[k]->end(); const std::string &name = entity_spans[k]->name(); int id = dictionary.GetEntityAlphabet().Lookup(name); CHECK_LT(id, 0xff); if (id < 0) id = kUnknownEntity; entity_spans_[k] = new NumericSpan(start, end, id); } const std::vector<NamedSpan*> &constituent_spans = instance->GetConstituentSpans(); constituent_spans_.resize(constituent_spans.size()); for (int k = 0; k < constituent_spans.size(); ++k) { int start = constituent_spans[k]->start(); int end = constituent_spans[k]->end(); const std::string &name = constituent_spans[k]->name(); int id = dictionary.GetConstituentAlphabet().Lookup(name); CHECK_LT(id, 0xffff); if (id < 0) id = kUnknownConstituent; constituent_spans_[k] = new NumericSpan(start, end, id); } //std::map<std::string, int> span_names; const std::vector<NamedSpan*> &coreference_spans = instance->GetCoreferenceSpans(); coreference_spans_.resize(coreference_spans.size()); for (int k = 0; k < coreference_spans.size(); ++k) { int start = coreference_spans[k]->start(); int end = coreference_spans[k]->end(); const std::string &name = coreference_spans[k]->name(); std::map<std::string, int>::const_iterator it = coreference_span_names->find(name); int id = -1; if (it == coreference_span_names->end()) { id = coreference_span_names->size(); (*coreference_span_names)[name] = id; } else { id = it->second; } coreference_spans_[k] = new NumericSpan(start, end, id); } int length = instance->size(); first_upper_.resize(length); for (int i = 0; i < length; ++i) { const char* word = instance->GetForm(i).c_str(); int word_length = instance->GetForm(i).length(); first_upper_[i] = IsCapitalized(word, word_length); } // Generate candidate mentions. GenerateMentions(dictionary, instance); //LOG(INFO) << mentions_.size() << " found in sentence."; // Add gold mentions as candidates (only for training). if (add_gold_mentions) AddGoldMentions(dictionary, instance); #if 0 LOG(INFO) << mentions_.size() << " found in sentence."; for (int i = 0; i < mentions_.size(); ++i) { LOG(INFO) << "Mention " << mentions_[i]->start() << " " << mentions_[i]->end(); } #endif #if 0 // Print mention information (for debugging purposes). LOG(INFO) << mentions_.size() << " found in sentence."; for (int i = 0; i < mentions_.size(); ++i) { mentions_[i]->Print(dictionary, instance); } #endif }